# Session 1.15: String Processing and Text Analysis

## **Essential for Healthcare Text Data and PySpark String Functions**

### **Learning Objectives**
By the end of this session, you will:
- Master advanced string manipulation techniques
- Process healthcare text data effectively
- Apply string methods essential for PySpark text processing
- Build text analysis pipelines for clinical data

---

### **Relevance to PySpark**
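String processing is fundamental in PySpark for cleaning text data, parsing structured text, and preparing data for analysis. The skills in this session translate directly to PySpark's column-level string functions in `pyspark.sql.functions`, such as `trim`, `lower`, `split`, `regexp_extract`, and `regexp_replace`.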

---

## 1. Advanced String Manipulation

In [None]:
# Healthcare text data examples
# Every note uses the fixed layout
#   "Patient: <name>, Age: <age>, Chief Complaint: <free text>"
# which extract_patient_info() relies on when slicing out the fields.
patient_notes = [
    "Patient: John Doe, Age: 45, Chief Complaint: Chest pain and shortness of breath",
    "Patient: Jane Smith, Age: 32, Chief Complaint: Headache, nausea, and dizziness",
    "Patient: Bob Johnson, Age: 58, Chief Complaint: Joint pain in knees and back",
    "Patient: Alice Brown, Age: 28, Chief Complaint: Persistent cough and fever"
]

# Every entry uses the layout "<drug> <dose><unit> - <instructions>", which is
# the shape validate_medication_format() checks for.
medication_data = [
    "Lisinopril 10mg - Take once daily for blood pressure",
    "Metformin 500mg - Take twice daily with meals for diabetes",
    "Ibuprofen 200mg - Take as needed for pain, max 3 times daily",
    "Omeprazole 20mg - Take once daily before breakfast for acid reflux"
]

# String cleaning and normalization
def clean_patient_name(name_string):
    """Return a patient name with tidy spacing, title case, and clean suffixes.

    Steps:
      * leading/trailing whitespace is removed and inner runs of whitespace
        collapse to a single space;
      * every word is title-cased;
      * a trailing period is dropped from "Jr." / "Sr." (any input casing)
        when the word is not the first one in the name;
      * a generational Roman numeral (II, III, IV) in final position is kept
        in upper case instead of being title-cased to "Ii" / "Iii" / "Iv".

    Args:
        name_string: Raw name text, e.g. "  john   doe  III ".

    Returns:
        The cleaned name, e.g. "John Doe III". An empty or whitespace-only
        input yields "".

    Note:
        ``str.title()`` capitalises the letter after every non-letter, so
        names with internal capitals (e.g. "McDonald") come back as
        "Mcdonald". That limitation is unchanged.
    """
    period_suffixes = {'Jr.': 'Jr', 'Sr.': 'Sr'}
    roman_suffixes = {'Ii': 'II', 'Iii': 'III', 'Iv': 'IV'}

    # title() first, then split(): split() with no arguments both strips the
    # ends and collapses repeated whitespace.
    words = name_string.title().split()

    # Suffix periods are only dropped after the first word, so a name that
    # merely starts with "Jr." is left alone.
    words = [
        word if position == 0 else period_suffixes.get(word, word)
        for position, word in enumerate(words)
    ]

    # Only restore a Roman numeral when it trails at least two other words;
    # this avoids rewriting a two-word name whose surname happens to be "Ii".
    if len(words) >= 3:
        words[-1] = roman_suffixes.get(words[-1], words[-1])

    return ' '.join(words)

def _find_marker(note, marker):
    """Return the index of *marker* in *note*; raise ValueError if it is absent."""
    position = note.find(marker)
    if position == -1:
        # The note text itself is deliberately left out of the message so
        # patient details do not leak into logs or tracebacks.
        raise ValueError(f"Patient note is missing the {marker!r} field marker")
    return position


def extract_patient_info(note):
    """Extract name, age, and chief complaint from a structured patient note.

    The note must follow the layout
    ``"Patient: <name>, Age: <age>, Chief Complaint: <text>"``.

    Args:
        note: One patient note string in the layout above.

    Returns:
        A dict with keys ``'name'`` (cleaned via ``clean_patient_name``),
        ``'age'`` (int) and ``'chief_complaint'`` (whitespace-stripped str).

    Raises:
        ValueError: If a field marker is missing, the fields are out of
            order, or the age is not a whole number. Without these checks a
            missing marker makes ``str.find`` return -1 and the slices below
            would silently produce wrong values.
    """
    name_start = _find_marker(note, "Patient: ") + len("Patient: ")
    name_end = _find_marker(note, ", Age:")
    age_start = _find_marker(note, "Age: ") + len("Age: ")
    age_end = _find_marker(note, ", Chief Complaint:")
    complaint_start = (
        _find_marker(note, "Chief Complaint: ") + len("Chief Complaint: ")
    )

    # All markers exist; make sure they also appear in the expected order so
    # none of the slices is empty-by-accident or overlapping.
    if not (name_start <= name_end < age_start <= age_end < complaint_start):
        raise ValueError(
            "Patient note fields are not in the order "
            "Patient, Age, Chief Complaint"
        )

    age_text = note[age_start:age_end]
    try:
        age = int(age_text)
    except ValueError:
        raise ValueError(
            f"Patient age is not a whole number: {age_text!r}"
        ) from None

    return {
        'name': clean_patient_name(note[name_start:name_end]),
        'age': age,
        'chief_complaint': note[complaint_start:].strip()
    }

# Demo: parse every sample note and print its three fields, with a blank
# line after each patient.
print("Extracting Patient Information:")
print("=" * 40)

for note in patient_notes:
    patient_info = extract_patient_info(note)
    for label, field in (
        ("Name", 'name'),
        ("Age", 'age'),
        ("Complaint", 'chief_complaint'),
    ):
        print(f"{label}: {patient_info[field]}")
    print()

# String validation and formatting
def validate_medication_format(med_string):
    """Check that a medication line looks like "<drug> <dose><unit> - <instructions>".

    Args:
        med_string: One medication line, e.g.
            "Lisinopril 10mg - Take once daily for blood pressure".

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is "Valid format" on
        success, otherwise one of "Missing instructions separator",
        "Missing or invalid dosage format", or "Instructions too short".
    """
    # Imported locally so the function also works before the module-level
    # `import re` further down the file has been executed.
    import re

    # partition() splits on the first " - " only, like split(' - ', 1), and
    # reports a missing separator through an empty middle element.
    medication_part, separator, instructions = med_string.partition(' - ')
    if not separator:
        return False, "Missing instructions separator"

    # Dosage = number (optionally decimal) immediately followed by a unit
    # (mg, g, ml, mcg). Units are matched case-insensitively so spellings
    # such as "500MG" or "5mL" are accepted.
    dosage_pattern = r'\d+(?:\.\d+)?(?:mg|g|ml|mcg)'
    if not re.search(dosage_pattern, medication_part, re.IGNORECASE):
        return False, "Missing or invalid dosage format"

    # Fewer than five non-blank characters cannot be a usable instruction.
    if len(instructions.strip()) < 5:
        return False, "Instructions too short"

    return True, "Valid format"

# Demo: validate every sample medication line; a failed line is followed by
# the reason it was rejected.
print("Medication Format Validation:")
print("=" * 35)

for med in medication_data:
    is_valid, message = validate_medication_format(med)
    if is_valid:
        status = "✓"
        print(f"{status} {med}")
    else:
        status = "✗"
        print(f"{status} {med}")
        print(f"   Error: {message}")
    print()

## 2. Regular Expressions for Healthcare Data

In [None]:
import re
from datetime import datetime

# Sample clinical text data: five free-text notes covering vitals, lab
# values, prescriptions, scheduling, and body measurements
clinical_notes = [
    "Patient presented on 2024-01-15 with BP 140/90 mmHg, HR 85 bpm, temp 98.6°F",
    "Lab results: Glucose 145 mg/dL, Cholesterol 220 mg/dL, HbA1c 7.2%",
    "Prescribed: Lisinopril 10mg QD, Metformin 500mg BID with meals",
    "Follow-up scheduled for 2024-02-15 at 2:30 PM in cardiology clinic",
    "Patient weight: 185 lbs, height: 5'10\", BMI calculated at 26.5"
]

# Regular expression patterns for healthcare data, keyed by the name that
# extract_clinical_data() looks up (it applies them case-insensitively)
patterns = {
    'date': r'\d{4}-\d{2}-\d{2}',  # digits shaped YYYY-MM-DD; values are not range-checked
    'blood_pressure': r'\d{2,3}/\d{2,3}\s*mmHg',  # e.g. 140/90 mmHg
    'heart_rate': r'\d{2,3}\s*bpm',  # e.g. 85 bpm
    'temperature': r'\d{2,3}\.\d\s*°F',  # exactly one decimal digit, e.g. 98.6°F
    'lab_value': r'\w+\s+\d+(?:\.\d+)?\s*(?:mg/dL|%)',  # test name + number + unit
    'medication': r'[A-Za-z]+\s+\d+mg\s+(?:QD|BID|TID|QID)',  # name + whole-number mg dose + frequency code
    'time': r'\d{1,2}:\d{2}\s*(?:AM|PM)',  # 12-hour clock, e.g. 2:30 PM
    'weight': r'\d+(?:\.\d+)?\s*lbs',  # e.g. 185 lbs
    'height': r"\d'\d+\"",  # single-digit feet and inches, e.g. 5'10"
    'bmi': r'BMI.*?\d+\.\d+'  # "BMI" through the first decimal number after it
}

def extract_clinical_data(text, pattern_name):
    """Return every case-insensitive match of a named pattern in *text*.

    The regex is looked up in the module-level ``patterns`` dict; an unknown
    (or empty) pattern name produces an empty list.
    """
    regex = patterns.get(pattern_name)
    return re.findall(regex, text, re.IGNORECASE) if regex else []

def parse_vital_signs(text):
    """Pull blood pressure, heart rate, and temperature out of clinical text.

    Returns a dict that contains only the readings that were found:
    ``systolic`` / ``diastolic`` (int) from "BP <n>/<n>", ``heart_rate``
    (int) from "HR <n>", and ``temperature`` (float) from "temp <n.n>".
    Labels are matched case-insensitively.
    """
    readings = {}

    blood_pressure = re.search(r'BP\s+(\d{2,3})/(\d{2,3})', text, re.IGNORECASE)
    if blood_pressure is not None:
        upper, lower = blood_pressure.groups()
        readings['systolic'] = int(upper)
        readings['diastolic'] = int(lower)

    pulse = re.search(r'HR\s+(\d{2,3})', text, re.IGNORECASE)
    if pulse is not None:
        readings['heart_rate'] = int(pulse.group(1))

    temperature = re.search(r'temp\s+(\d{2,3}\.\d)', text, re.IGNORECASE)
    if temperature is not None:
        readings['temperature'] = float(temperature.group(1))

    return readings

def parse_lab_results(text):
    """Map each lab test found in *text* to its numeric value and unit.

    A result is "<name> <number> <unit>" where the unit is ``mg/dL`` or
    ``%``. Keys are lower-cased test names; each value is a dict with
    ``'value'`` (float) and ``'unit'`` (as written in the text). If a test
    name repeats, the last occurrence wins.
    """
    found = re.findall(
        r'(\w+)\s+(\d+(?:\.\d+)?)\s*(mg/dL|%)', text, re.IGNORECASE
    )
    return {
        name.lower(): {'value': float(amount), 'unit': unit}
        for name, amount, unit in found
    }

def parse_medications(text):
    """Extract "<name> <dose>mg <frequency code>" prescriptions from text.

    Args:
        text: Clinical free text, e.g. "Lisinopril 10mg QD, Metformin 500mg BID".

    Returns:
        A list of dicts, in order of appearance, each with ``'name'``
        (title-cased), ``'dosage'`` (e.g. "10mg" or "0.5mg") and
        ``'frequency'`` (the code QD/BID/TID/QID spelled out). An empty list
        is returned when nothing matches.
    """
    # Dose accepts an optional decimal part so sub-milligram doses such as
    # "0.5mg" are captured instead of being skipped.
    med_pattern = r'([A-Za-z]+)\s+(\d+(?:\.\d+)?)mg\s+(QD|BID|TID|QID)'

    frequency_map = {
        'QD': 'Once daily',
        'BID': 'Twice daily',
        'TID': 'Three times daily',
        'QID': 'Four times daily'
    }

    return [
        {
            'name': med_name.title(),
            'dosage': f"{dosage}mg",
            # Matching is case-insensitive, so normalise the code before the
            # lookup; fall back to the raw text if it is somehow unknown.
            'frequency': frequency_map.get(frequency.upper(), frequency)
        }
        for med_name, dosage, frequency
        in re.findall(med_pattern, text, re.IGNORECASE)
    ]

# Demo: run every extractor over each clinical note and print only the
# categories that produced something.
print("Clinical Data Extraction with Regular Expressions:")
print("=" * 55)

for i, note in enumerate(clinical_notes, 1):
    print(f"\nNote {i}: {note}")
    print("Extracted data:")

    # Run all extractors first; they are independent of each other.
    dates = extract_clinical_data(note, 'date')
    vitals = parse_vital_signs(note)
    labs = parse_lab_results(note)
    meds = parse_medications(note)
    times = extract_clinical_data(note, 'time')

    # Report in a fixed order, skipping empty results.
    for label, found in (
        ("Dates", dates),
        ("Vital signs", vitals),
        ("Lab results", labs),
        ("Medications", meds),
        ("Times", times),
    ):
        if found:
            print(f"  {label}: {found}")

## 3. Text Cleaning and Normalization

In [None]:
# Sample messy healthcare text data: irregular spacing, inconsistent casing,
# and clinical shorthand (PCN, NKDA, Hx, ppd, ETOH) for the cleaning
# functions defined after it
messy_clinical_data = [
    "   Patient: JOHN    DOE,  age: 45   Chief complaint:    chest pain    ",
    "Diagnosis: Type II Diabetes Mellitus, Hypertension, hyperlipidemia",
    "MEDICATIONS: lisinopril   10mg,  metformin 500MG,  simvastatin  20mg",
    "Allergies: PCN (penicillin),  NKDA, shellfish   ",
    "Social Hx: Smokes 1 ppd x 20 years, ETOH: 2-3 drinks/week, denies illicit drugs"
]

def normalize_text(text):
    """Normalize whitespace, unit casing, and abbreviations in clinical text.

    Args:
        text: Raw clinical free text.

    Returns:
        The text with runs of whitespace collapsed to single spaces, the
        units "mg" / "mL" written in that canonical casing, and the
        abbreviations PCN, NKDA, ETOH, ppd, Hx and "Type II" expanded
        (all matched case-insensitively).
    """
    # split() with no arguments strips both ends and collapses inner runs.
    text = ' '.join(text.split())

    # Standardize unit casing. A plain r'\bmg\b' never matches "500MG"
    # because a digit and a letter are both word characters, so there is no
    # word boundary between them. The lookbehind only requires that the unit
    # is not the tail of a longer alphabetic word, so it fires for both
    # "500MG" and "500 MG".
    text = re.sub(r'(?<![A-Za-z])mg\b', 'mg', text, flags=re.IGNORECASE)
    text = re.sub(r'(?<![A-Za-z])ml\b', 'mL', text, flags=re.IGNORECASE)

    # Standardize medical abbreviations (whole words only).
    abbreviations = {
        r'\bPCN\b': 'Penicillin',
        r'\bNKDA\b': 'No Known Drug Allergies',
        r'\bETOH\b': 'Alcohol',
        r'\bppd\b': 'packs per day',
        r'\bHx\b': 'History',
        r'\bType II\b': 'Type 2'
    }

    for abbrev, expansion in abbreviations.items():
        text = re.sub(abbrev, expansion, text, flags=re.IGNORECASE)

    return text.strip()

def standardize_medication_names(text):
    """Rewrite known drug names in *text* with a leading capital letter.

    Each known name is matched as a whole word, ignoring case, and replaced
    by its canonical spelling. Everything else in the text is left as is.
    """
    canonical_names = (
        'Lisinopril',
        'Metformin',
        'Simvastatin',
        'Amlodipine',
        'Omeprazole',
        'Ibuprofen',
    )

    for proper_name in canonical_names:
        whole_word = re.compile(
            r'\b' + re.escape(proper_name.lower()) + r'\b', re.IGNORECASE
        )
        text = whole_word.sub(proper_name, text)

    return text

def extract_structured_data(text):
    """Collect labelled fields from normalized clinical text into a dict.

    Recognised labels (case-insensitive) and the keys they produce:
      * "Patient:"     -> ``patient_name`` (text up to the next comma, title-cased)
      * "age:"         -> ``age`` (int)
      * "Diagnosis:"   -> ``diagnoses`` (comma-separated list)
      * "MEDICATIONS:" -> ``medications`` (comma-separated list)
      * "Allergies:"   -> ``allergies`` (comma-separated list)

    Only the fields present in *text* appear in the result.
    """
    record = {}

    found = re.search(r'Patient:\s*([^,]+)', text, re.IGNORECASE)
    if found:
        record['patient_name'] = found.group(1).strip().title()

    found = re.search(r'age:\s*(\d+)', text, re.IGNORECASE)
    if found:
        record['age'] = int(found.group(1))

    # The remaining fields share one shape: a label followed by a
    # comma-separated list that runs to the end of the line.
    list_fields = (
        ('diagnoses', r'Diagnosis:\s*([^\n]+)'),
        ('medications', r'MEDICATIONS:\s*([^\n]+)'),
        ('allergies', r'Allergies:\s*([^\n]+)'),
    )
    for key, field_pattern in list_fields:
        found = re.search(field_pattern, text, re.IGNORECASE)
        if found:
            record[key] = [item.strip() for item in found.group(1).split(',')]

    return record

# Demo: show each messy sample before and after cleaning, plus whatever
# structured fields could be pulled from the cleaned text.
print("Text Cleaning and Normalization:")
print("=" * 40)

for i, messy_text in enumerate(messy_clinical_data, 1):
    print(f"\nOriginal {i}: {messy_text}")

    # Normalize first, then fix medication capitalisation.
    normalized = standardize_medication_names(normalize_text(messy_text))
    print(f"Cleaned  {i}: {normalized}")

    # An empty dict means no labelled field was recognised; print nothing.
    structured = extract_structured_data(normalized)
    if not structured:
        continue
    print(f"Structured: {structured}")

# Demonstrate text preprocessing pipeline
def clinical_text_preprocessing_pipeline(text_list):
    """Run clean -> standardize -> extract over every text and return the records.

    Each returned dict holds the ``original_text``, the ``cleaned_text``
    (normalized, with medication names standardized), the
    ``structured_data`` extracted from the cleaned text, and a
    ``processing_timestamp`` (ISO-format string of ``datetime.now()`` taken
    when that record was built).
    """

    def build_record(raw_text):
        # Steps 1 + 2: basic cleaning, then medical-term standardization.
        cleaned_text = standardize_medication_names(normalize_text(raw_text))
        # Step 3 happens before the timestamp (step 4) is taken.
        fields = extract_structured_data(cleaned_text)
        return {
            'original_text': raw_text,
            'cleaned_text': cleaned_text,
            'structured_data': fields,
            'processing_timestamp': datetime.now().isoformat()
        }

    return [build_record(raw_text) for raw_text in text_list]

# Demo: push the first two messy samples through the full pipeline and show
# every field of each resulting record.
print("\n\nComplete Preprocessing Pipeline:")
print("=" * 40)

pipeline_results = clinical_text_preprocessing_pipeline(messy_clinical_data[:2])

for i, result in enumerate(pipeline_results, 1):
    # Only the first 50 characters of the original are shown.
    preview = result['original_text'][:50]
    print(f"\nResult {i}:")
    print(f"  Original: {preview}...")
    for label, field in (
        ("Cleaned", 'cleaned_text'),
        ("Structured", 'structured_data'),
        ("Processed", 'processing_timestamp'),
    ):
        print(f"  {label}: {result[field]}")

## 4. Text Analysis and Statistics

In [None]:
from collections import Counter
import string

# Sample clinical text corpus: eight short notes, each pairing a finding or
# diagnosis with the action taken; used by the analysis functions and demos
# in this cell
clinical_corpus = [
    "Patient presents with acute myocardial infarction. Administered aspirin and nitroglycerin.",
    "Chronic obstructive pulmonary disease exacerbation. Prescribed bronchodilators and steroids.",
    "Type 2 diabetes mellitus with poor glycemic control. Increased Metformin dosage.",
    "Hypertension well controlled on current medications. Continue Lisinopril therapy.",
    "Patient reports chest pain and shortness of breath. Ordered cardiac enzymes and ECG.",
    "Pneumonia confirmed by chest X-ray. Started on antibiotic therapy immediately.",
    "Acute kidney injury secondary to dehydration. Initiated fluid resuscitation protocol.",
    "Diabetic ketoacidosis treated with insulin infusion and fluid replacement therapy."
]

def tokenize_medical_text(text):
    """Lower-case *text*, drop punctuation except hyphens, and split on whitespace.

    Hyphens survive so that terms such as "x-ray" stay a single token. All
    other ASCII punctuation is deleted outright (not replaced by a space).
    """
    removable = ''.join(
        symbol for symbol in string.punctuation if symbol != '-'
    )
    strip_table = str.maketrans('', '', removable)
    return text.lower().translate(strip_table).split()

def calculate_text_statistics(text_list):
    """Compute token-level summary statistics for a collection of texts.

    Args:
        text_list: Iterable of text documents; each is tokenized with
            ``tokenize_medical_text``.

    Returns:
        A dict with:
          * ``total_documents``     - number of texts
          * ``total_tokens``        - token count over all texts
          * ``unique_tokens``       - number of distinct tokens
          * ``lexical_diversity``   - unique_tokens / total_tokens
          * ``avg_sentence_length`` - mean tokens per text (each text is
            counted as one unit), rounded to 2 places
          * ``avg_word_length``     - mean characters per token, rounded
            to 2 places
          * ``most_common_words``   - up to 10 ``(token, count)`` pairs

        For an empty collection, or texts that contain no tokens, the
        ratios and averages are 0.0 instead of raising ZeroDivisionError.
    """
    tokens_per_document = [tokenize_medical_text(text) for text in text_list]
    all_tokens = [
        token for tokens in tokens_per_document for token in tokens
    ]

    total_documents = len(tokens_per_document)
    total_tokens = len(all_tokens)
    unique_tokens = len(set(all_tokens))

    # Guard every division: both denominators can legitimately be zero.
    if total_documents:
        avg_sentence_length = total_tokens / total_documents
    else:
        avg_sentence_length = 0.0

    if total_tokens:
        avg_word_length = sum(len(token) for token in all_tokens) / total_tokens
        lexical_diversity = unique_tokens / total_tokens
    else:
        avg_word_length = 0.0
        lexical_diversity = 0.0

    return {
        'total_documents': total_documents,
        'total_tokens': total_tokens,
        'unique_tokens': unique_tokens,
        'lexical_diversity': lexical_diversity,
        'avg_sentence_length': round(avg_sentence_length, 2),
        'avg_word_length': round(avg_word_length, 2),
        'most_common_words': Counter(all_tokens).most_common(10)
    }

def extract_medical_entities(text_list):
    """Count known conditions, medications, and procedures across the texts.

    The texts are joined with spaces and lower-cased, then scanned with one
    whole-word regex per category. Returns a dict with the keys
    ``conditions``, ``medications`` and ``procedures``, each a ``Counter``
    of the (lower-case) terms found.
    """
    # One alternation of known terms per entity category.
    category_patterns = {
        'conditions': r'\b(?:hypertension|diabetes|pneumonia|asthma|copd|myocardial infarction|kidney injury|ketoacidosis)\b',
        'medications': r'\b(?:aspirin|metformin|lisinopril|insulin|nitroglycerin|bronchodilators|steroids|antibiotics?)\b',
        'procedures': r'\b(?:ecg|x-ray|ct scan|mri|blood test|cardiac enzymes)\b',
    }

    corpus = ' '.join(text_list).lower()

    return {
        category: Counter(re.findall(term_pattern, corpus, re.IGNORECASE))
        for category, term_pattern in category_patterns.items()
    }

def analyze_treatment_patterns(text_list):
    """Count treatment-related keywords per category across the texts.

    Each text is lower-cased and every keyword is counted as a substring
    (so a keyword inside a longer word is counted too). Returns a dict
    mapping ``medication``, ``procedure`` and ``therapy`` to their totals.
    """
    treatment_keywords = {
        'medication': ['prescribed', 'administered', 'started', 'given', 'continued'],
        'procedure': ['ordered', 'performed', 'conducted', 'scheduled'],
        'therapy': ['therapy', 'treatment', 'protocol', 'intervention']
    }

    lowered_texts = [document.lower() for document in text_list]

    return {
        category: sum(
            document.count(keyword)
            for document in lowered_texts
            for keyword in keywords
        )
        for category, keywords in treatment_keywords.items()
    }

# Demo: corpus statistics, entity counts, and treatment keyword counts for
# the sample clinical corpus.
print("Clinical Text Analysis:")
print("=" * 30)

# --- Text statistics -------------------------------------------------------
stats = calculate_text_statistics(clinical_corpus)
print("Text Statistics:")
for key, value in stats.items():
    # The word-frequency list gets its own section below.
    if key == 'most_common_words':
        continue
    heading = key.replace('_', ' ').title()
    print(f"  {heading}: {value}")

print("\nMost Common Words:")
for word, count in stats['most_common_words']:
    print(f"  {word}: {count}")

# --- Medical entities ------------------------------------------------------
print("\nMedical Entity Extraction:")
entities = extract_medical_entities(clinical_corpus)

for entity_type, entity_counts in entities.items():
    print(f"\n{entity_type.title()}:")
    # most_common() with no argument lists every term, most frequent first.
    for entity, count in entity_counts.most_common():
        print(f"  {entity}: {count}")

# --- Treatment patterns ----------------------------------------------------
print("\nTreatment Pattern Analysis:")
treatment_patterns = analyze_treatment_patterns(clinical_corpus)
for pattern, count in treatment_patterns.items():
    category_name = pattern.title()
    print(f"  {category_name} mentions: {count}")

# Document similarity analysis
def calculate_document_similarity(doc1, doc2):
    """Return the Jaccard similarity of the two documents' token sets.

    Jaccard similarity is |shared tokens| / |all distinct tokens|, a value
    between 0.0 and 1.0. Two documents without any tokens score 0.0.
    """
    vocabulary_a = set(tokenize_medical_text(doc1))
    vocabulary_b = set(tokenize_medical_text(doc2))

    combined = vocabulary_a | vocabulary_b
    if not combined:
        return 0.0

    shared = vocabulary_a & vocabulary_b
    return len(shared) / len(combined)

# Demo: compare every unordered pair among the first three corpus documents.
print("\nDocument Similarity Analysis:")
print("(Using first 3 documents for demo)")

for i, j in [(a, b) for a in range(3) for b in range(a + 1, 3)]:
    similarity = calculate_document_similarity(clinical_corpus[i], clinical_corpus[j])
    print(f"Document {i+1} vs Document {j+1}: {similarity:.3f} similarity")
    # Show a 50-character preview of each document in the pair.
    for index in (i, j):
        print(f"  Doc {index+1}: {clinical_corpus[index][:50]}...")
    print()

## 5. String Performance and Memory Optimization

In [None]:
import time
import sys

def string_performance_comparison():
    """Time four ways of building one comma-separated string of patient IDs.

    Builds 1000 IDs ("PT000000" ...), joins them with repeated ``+=``,
    ``str.join`` on the list, ``str.join`` on a list comprehension, and
    ``str.join`` on a generator expression, then prints the elapsed time of
    each and the speed-up of ``join()`` over ``+=``.

    Returns:
        None. All output goes to stdout.

    Raises:
        AssertionError: If the four methods do not produce the same string.
    """
    # Sample data
    patient_ids = [f"PT{i:06d}" for i in range(1000)]

    print("String Performance Comparison:")
    print("=" * 40)

    # time.perf_counter() is the high-resolution clock meant for measuring
    # short intervals; time.time() can be too coarse for microsecond work
    # and may report an elapsed time of exactly zero.

    # Method 1: String concatenation with + (deliberately the slow baseline)
    start_time = time.perf_counter()
    result1 = ""
    for pid in patient_ids:
        result1 += pid + ","
    result1 = result1.rstrip(",")
    concat_time = time.perf_counter() - start_time

    # Method 2: Using join()
    start_time = time.perf_counter()
    result2 = ",".join(patient_ids)
    join_time = time.perf_counter() - start_time

    # Method 3: Using list comprehension and join
    start_time = time.perf_counter()
    result3 = ",".join([pid for pid in patient_ids])
    list_comp_time = time.perf_counter() - start_time

    # Method 4: Using generator and join
    start_time = time.perf_counter()
    result4 = ",".join(pid for pid in patient_ids)
    generator_time = time.perf_counter() - start_time

    print(f"String concatenation (+): {concat_time:.6f} seconds")
    print(f"Join method:             {join_time:.6f} seconds")
    print(f"List comprehension:      {list_comp_time:.6f} seconds")
    print(f"Generator expression:    {generator_time:.6f} seconds")

    # Guard the ratio: a zero measurement must not crash the demo.
    if join_time > 0:
        print(f"\nPerformance improvement with join(): {concat_time/join_time:.1f}x faster")
    else:
        print("\nPerformance improvement with join(): join() was too fast to measure")

    # Verify results are identical. An explicit raise is used because
    # `assert` statements are removed when Python runs with -O.
    if not (result1 == result2 == result3 == result4):
        raise AssertionError("String-building methods produced different results")
    print("✓ All methods produce identical results")

def memory_efficient_text_processing():
    """Contrast eager (list-building) and lazy (generator) line processing.

    Builds 1000 sample lines, processes them once with a function that
    returns a complete list and once with a generator that yields one line
    at a time, then prints the elapsed time and ``sys.getsizeof`` figure for
    each approach. Returns None; all output goes to stdout.

    Caveats when reading the numbers:
      * ``sys.getsizeof`` is shallow: it reports the size of the list or
        generator object itself, not of the strings it holds or will
        produce, so "Memory savings" compares container overhead only.
      * The generator timing covers creating the generator and pulling just
        the first five records, not processing all 1000 lines.
    """

    def process_large_text_file_inefficient(lines):
        """Eager approach: process every line and return them all in a list."""
        all_data = []
        for line in lines:
            # Simulate processing
            processed = line.strip().upper()
            all_data.append(processed)
        return all_data

    def process_large_text_file_efficient(lines):
        """Lazy approach: yield each processed line as the caller asks for it."""
        for line in lines:
            # Simulate processing
            processed = line.strip().upper()
            yield processed

    # Generate sample data
    sample_lines = [f"Patient record {i}: medical data here\n" for i in range(1000)]

    print("\nMemory Usage Comparison:")
    print("=" * 30)

    # Inefficient method: all 1000 lines are processed before the call returns
    start_time = time.time()
    inefficient_result = process_large_text_file_inefficient(sample_lines)
    inefficient_time = time.time() - start_time
    inefficient_memory = sys.getsizeof(inefficient_result)  # list object only (shallow)

    # Efficient method: calling the generator function does no work yet
    start_time = time.time()
    efficient_result = process_large_text_file_efficient(sample_lines)
    # Process first 5 items to demonstrate
    first_five = [next(efficient_result) for _ in range(5)]
    efficient_time = time.time() - start_time
    efficient_memory = sys.getsizeof(efficient_result)  # generator object only (shallow)

    print(f"Inefficient approach:")
    print(f"  Time: {inefficient_time:.6f} seconds")
    print(f"  Memory: {inefficient_memory:,} bytes")
    print(f"  Records: {len(inefficient_result)}")

    print(f"\nEfficient approach:")
    print(f"  Time (first 5): {efficient_time:.6f} seconds")
    print(f"  Memory: {efficient_memory:,} bytes")
    print(f"  Sample: {first_five[0] if first_five else 'None'}")

    # Percentage by which the generator object is smaller than the list object
    memory_savings = ((inefficient_memory - efficient_memory) / inefficient_memory) * 100
    print(f"\nMemory savings: {memory_savings:.1f}%")

# Run performance comparisons. Both functions print their own report to
# stdout and return None; the timings vary from run to run.
string_performance_comparison()
memory_efficient_text_processing()

# String interning demonstration
def string_interning_demo():
    """Show how ``sys.intern`` lets equal strings share a single object.

    Builds two lists of 400 medical codes (4 distinct values x 100) from
    freshly created string objects, once as-is and once passed through
    ``sys.intern``, then prints the memory each list accounts for, the
    number of distinct string objects in each, and an identity check on two
    interned strings.

    Returns:
        None. All output goes to stdout.
    """

    def fresh_copy(code):
        # Simulate a code read from a file or database: an encode/decode
        # round trip builds a new, equal string object. str(code) cannot be
        # used for this because CPython returns the very same object when
        # the argument is already a str.
        return code.encode('utf-8').decode('utf-8')

    def deep_size(strings):
        # sys.getsizeof() on a list is shallow (the list object only), so
        # add the size of every distinct string object the list refers to.
        distinct = {id(item): item for item in strings}
        return sys.getsizeof(strings) + sum(
            sys.getsizeof(item) for item in distinct.values()
        )

    print("\nString Interning for Medical Codes:")
    print("=" * 40)

    # Common medical codes that appear frequently
    common_codes = ['ICD10-E11.9', 'ICD10-I10', 'ICD10-J44.1', 'CPT-99213']

    # Without interning - every occurrence is its own string object
    codes_without_interning = []
    for _ in range(100):
        for code in common_codes:
            codes_without_interning.append(fresh_copy(code))

    # With interning - equal strings collapse to one shared object
    codes_with_interning = []
    for _ in range(100):
        for code in common_codes:
            codes_with_interning.append(sys.intern(fresh_copy(code)))

    # Check memory usage (list object plus the distinct strings it holds)
    memory_without = deep_size(codes_without_interning)
    memory_with = deep_size(codes_with_interning)

    print(f"Without interning: {memory_without:,} bytes")
    print(f"With interning:    {memory_with:,} bytes")

    objects_without = len({id(item) for item in codes_without_interning})
    objects_with = len({id(item) for item in codes_with_interning})
    print(
        f"Distinct string objects: {objects_without} without interning, "
        f"{objects_with} with interning"
    )

    # Test object identity on two separately created, then interned, strings
    code1 = sys.intern(fresh_copy('ICD10-E11.9'))
    code2 = sys.intern(fresh_copy('ICD10-E11.9'))
    print(f"\nInterned strings are identical objects: {code1 is code2}")
    print(f"This enables fast comparison and reduced memory usage")

# Run the interning demo; it prints its report to stdout.
string_interning_demo()

## 6. Practice Exercise

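Build a comprehensive clinical text processing system by implementing the functions listed in the TODO below, reusing the string, regular-expression, and normalization techniques from Sections 1–4.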

In [None]:
# Exercise: Clinical Notes Processing System
# Build a system that processes clinical notes and extracts structured information
#
# Each sample note is one string of period-separated sections. All three
# start with "Patient: <name>, DOB: <YYYY-MM-DD>, MRN: <digits>", followed by
# "Chief complaint:", "VS:" (vital signs) and "Plan:" sections; "PMH:",
# "Meds:", "Physical exam:" and "Assessment:" appear only in some notes.

sample_clinical_notes = [
    "Patient: John Smith, DOB: 1975-03-15, MRN: 123456. Chief complaint: severe chest pain radiating to left arm, onset 2 hours ago. VS: BP 160/95, HR 105, RR 22, Temp 98.8F. Assessment: Rule out acute MI. Plan: ECG, cardiac enzymes, aspirin 325mg, nitro SL PRN.",
    "Patient: Mary Johnson, DOB: 1968-07-22, MRN: 789012. Chief complaint: SOB and ankle swelling x 3 days. PMH: CHF, HTN, DM. Meds: Lisinopril 10mg daily, Metformin 500mg BID. VS: BP 145/88, HR 92, RR 24, O2 sat 88% RA. Plan: CXR, BNP, increase Lisinopril to 20mg daily.",
    "Patient: Robert Davis, DOB: 1982-11-08, MRN: 345678. Chief complaint: persistent cough and fever x 5 days. VS: BP 125/78, HR 88, RR 20, Temp 101.2F. Physical exam: crackles bilateral lower lobes. Assessment: Community acquired pneumonia. Plan: Chest X-ray, CBC, Azithromycin 500mg daily x 5 days."
]

# TODO: Create functions for:
# 1. extract_patient_demographics(note) - Extract name, DOB, MRN
# 2. extract_vital_signs(note) - Extract all vital signs
# 3. extract_medications(note) - Extract medication names, doses, frequencies
# 4. extract_chief_complaint(note) - Extract and normalize chief complaint
# 5. extract_assessment_and_plan(note) - Extract clinical assessment and treatment plan
# 6. generate_structured_summary(note) - Combine all extractions into structured format
# 7. batch_process_notes(notes_list) - Process multiple notes efficiently

# Your code here

---

## Summary

In this session, you learned:
- ✅ Advanced string manipulation for healthcare data
- ✅ Regular expressions for clinical text parsing
- ✅ Text cleaning and normalization techniques
- ✅ Clinical text analysis and statistics
- ✅ String performance optimization strategies
- ✅ Memory-efficient text processing patterns
- ✅ Essential skills for PySpark string functions

**Next:** Session 1.16 - APIs and JSON Data Processing