# Medical Image Processing Analysis

## Tier 2: AWS Cloud Processing Pipeline

This notebook analyzes the results of the medical image preprocessing pipeline using AWS services:
- **S3**: Stores raw and processed images
- **Lambda**: Preprocesses images (resize, normalize)
- **DynamoDB**: Stores metadata about processing

**Objectives:**
1. Query prediction metadata from DynamoDB
2. Analyze processing performance and statistics
3. Visualize image statistics and distributions
4. Understand AWS costs
5. Download and inspect sample processed images

## Setup

Import required libraries and configure AWS clients.

In [None]:
import json
import os
from datetime import datetime, timezone
from pathlib import Path

import boto3
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from IPython.display import display

# Configure plotting
plt.style.use("seaborn-v0_8-darkgrid")
sns.set_palette("husl")
%matplotlib inline

# Load environment variables
load_dotenv()

# Configure AWS clients
region = os.getenv("AWS_REGION", "us-east-1")
dynamodb = boto3.resource("dynamodb", region_name=region)
s3_client = boto3.client("s3", region_name=region)

print(f"AWS Region: {region}")
print(f"DynamoDB Region: {region}")
print(f"Boto3 version: {boto3.__version__}")

## 1. Query Prediction Metadata from DynamoDB

Retrieve all prediction records from the DynamoDB table.

In [None]:
# Configuration
table_name = os.getenv("DYNAMODB_TABLE_NAME", "medical-predictions")
s3_bucket = os.getenv("S3_BUCKET_NAME", "")

print(f"DynamoDB Table: {table_name}")
print(f"S3 Bucket: {s3_bucket}")

# Connect to DynamoDB table
table = dynamodb.Table(table_name)

# Query all items.
# A single scan() call returns only one page of results, so keep requesting
# pages, resuming from LastEvaluatedKey, until the table is exhausted.
items = []
try:
    scan_kwargs = {"Limit": 200}  # page size per request, not a cap on the total
    while True:
        response = table.scan(**scan_kwargs)
        items.extend(response.get("Items", []))

        last_key = response.get("LastEvaluatedKey")
        if not last_key:
            break
        scan_kwargs["ExclusiveStartKey"] = last_key

    print(f"\nRetrieved {len(items)} records from DynamoDB")
except Exception as e:
    # Best effort: report the failure and continue with an empty result set
    # so the remaining cells still run.
    print(f"Error querying DynamoDB: {e}")
    items = []

In [None]:
# Parse and organize results
# Each DynamoDB item becomes one flat record: image_id, timestamp, plus every
# key found in its "metadata" attribute.
results = []

for item in items:
    record = {"image_id": item.get("image_id"), "timestamp": item.get("timestamp")}

    # "metadata" is either a JSON string or an already-decoded value
    metadata = item.get("metadata", "{}")
    if isinstance(metadata, str):
        try:
            metadata = json.loads(metadata)
        except json.JSONDecodeError:
            # Malformed JSON: keep the record, just without metadata fields
            metadata = {}

    # Only a mapping can be merged into the record; anything else
    # (None, a JSON list, a bare number, ...) is ignored instead of crashing.
    if isinstance(metadata, dict):
        record.update(metadata)

    results.append(record)

# Convert to DataFrame for easier analysis
df = pd.DataFrame(results)

print(f"\nResults DataFrame shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print("\nFirst few records:")
display(df.head())

## 2. Processing Performance Analysis

Analyze Lambda function execution performance.

In [None]:
# Summarise the recorded processing durations, skipping missing values
if "processing_time_ms" not in df.columns:
    print("No processing_time_ms data available")
else:
    processing_times = df["processing_time_ms"].dropna()

    print("Processing Time Statistics (milliseconds):")
    print(f"  {'Count:':<8}{len(processing_times)}")

    # (label, Series method) pairs, evaluated and printed one at a time
    for label, stat_name in (
        ("Min:", "min"),
        ("Max:", "max"),
        ("Mean:", "mean"),
        ("Median:", "median"),
        ("Std:", "std"),
    ):
        value = getattr(processing_times, stat_name)()
        print(f"  {label:<8}{value:.2f}")

In [None]:
# Visualize processing times
# Requires at least one non-missing value; an all-NaN column has nothing to plot.
if "processing_time_ms" in df.columns and df["processing_time_ms"].notna().any():
    # Drop missing values once so both panels use the same series and the
    # timeline's x positions always match the number of plotted points.
    plot_times = df["processing_time_ms"].dropna()

    fig, axes = plt.subplots(1, 2, figsize=(14, 4))

    # Histogram
    axes[0].hist(plot_times, bins=20, edgecolor="black", alpha=0.7)
    axes[0].set_xlabel("Processing Time (ms)")
    axes[0].set_ylabel("Frequency")
    axes[0].set_title("Distribution of Image Processing Times")
    axes[0].grid(True, alpha=0.3)

    # Timeline (x is the position among records that have a processing time)
    axes[1].plot(range(len(plot_times)), plot_times, marker="o", linestyle="-", alpha=0.7)
    axes[1].set_xlabel("Image Index")
    axes[1].set_ylabel("Processing Time (ms)")
    axes[1].set_title("Processing Time Timeline")
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    print("Not enough data for visualization")

## 3. Image Intensity Statistics

Analyze pixel intensity distributions in processed images.

In [None]:
# Collect summary statistics for each intensity column present in the data
intensity_stats = {}
stat_names = ("mean", "min", "max", "std")

for col in ("min_value", "max_value", "mean_value", "std_value"):
    if col not in df.columns:
        continue

    data = df[col].dropna()
    summary = {"count": len(data)}
    for name in stat_names:
        summary[name] = getattr(data, name)()
    intensity_stats[col] = summary

print("Image Intensity Statistics:")
print("(Normalized to [0, 1] range)\n")

# One labelled section per column, followed by a blank line
for metric, stats in intensity_stats.items():
    print(f"{metric}:")
    print(f"  {'Count:':<8}{stats['count']}")
    for name in stat_names:
        print(f"  {name.capitalize() + ':':<8}{stats[name]:.4f}")
    print()

In [None]:
# 2x2 grid of histograms, one panel per intensity statistic
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

intensity_cols = ["min_value", "max_value", "mean_value", "std_value"]
titles = ["Min Intensity", "Max Intensity", "Mean Intensity", "Std Intensity"]

# axes.flat walks the grid row by row, matching the column order above
for ax, col, title in zip(axes.flat, intensity_cols, titles):
    if col not in df.columns:
        # Leave the panel empty when the column was never recorded
        continue

    data = df[col].dropna()
    ax.hist(data, bins=20, edgecolor="black", alpha=0.7, color="steelblue")
    ax.set_xlabel(title)
    ax.set_ylabel("Frequency")
    ax.set_title(f"Distribution of {title}")
    ax.grid(True, alpha=0.3)

    # Mean/std annotation anchored to the panel's top-right corner
    stats_text = f"Mean: {data.mean():.4f}\nStd: {data.std():.4f}"
    box_style = dict(boxstyle="round", facecolor="wheat", alpha=0.8)
    ax.text(
        0.98,
        0.97,
        stats_text,
        transform=ax.transAxes,
        verticalalignment="top",
        horizontalalignment="right",
        bbox=box_style,
    )

plt.tight_layout()
plt.show()

## 4. Data Size Analysis

Analyze file sizes and data storage requirements.

In [None]:
# File size analysis
# Reports per-column size statistics, then the overall size reduction.
if "source_size" in df.columns and "output_size" in df.columns:
    source_sizes = df["source_size"].dropna()
    output_sizes = df["output_size"].dropna()

    print("Source Image Sizes:")
    print(f"  Total files: {len(source_sizes)}")
    print(f"  Total bytes: {source_sizes.sum():,.0f} ({source_sizes.sum() / 1e6:.2f} MB)")
    print(f"  Mean size:   {source_sizes.mean():,.0f} bytes ({source_sizes.mean() / 1e6:.4f} MB)")
    print(f"  Min size:    {source_sizes.min():,.0f} bytes")
    print(f"  Max size:    {source_sizes.max():,.0f} bytes")

    print("\nProcessed Image Sizes:")
    print(f"  Total bytes: {output_sizes.sum():,.0f} ({output_sizes.sum() / 1e6:.2f} MB)")
    print(f"  Mean size:   {output_sizes.mean():,.0f} bytes ({output_sizes.mean() / 1e6:.4f} MB)")
    print(f"  Min size:    {output_sizes.min():,.0f} bytes")
    print(f"  Max size:    {output_sizes.max():,.0f} bytes")

    # Compression ratio
    # Compare only images that have BOTH sizes recorded. Totals taken from the
    # two independently filtered columns could describe different sets of
    # images, and equal lengths alone do not prove they are the same rows.
    paired_sizes = df[["source_size", "output_size"]].dropna()
    paired_source_total = paired_sizes["source_size"].sum()

    if paired_source_total > 0:
        compression_ratio = (1 - paired_sizes["output_size"].sum() / paired_source_total) * 100
        print(f"\nCompression ratio: {compression_ratio:.1f}%")
    else:
        # Nothing to divide by: no paired rows, or every source size is zero
        print("\nCompression ratio: n/a (no images with both sizes recorded)")
else:
    print("File size data not available")

In [None]:
# Visualize file sizes
# Left panel: per-image size distribution (MB). Right panel: total storage (GB).
if "source_size" in df.columns and "output_size" in df.columns and len(df) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Both columns are guaranteed to exist by the guard above, so filter each
    # once and reuse the result for both panels.
    source_bytes = df["source_size"].dropna()
    output_bytes = df["output_size"].dropna()

    # File size comparison
    size_data = pd.DataFrame(
        {
            "Source": source_bytes / 1e6,  # Convert to MB
            "Processed": output_bytes / 1e6,
        }
    )

    size_data.plot(kind="box", ax=axes[0])
    axes[0].set_ylabel("File Size (MB)")
    axes[0].set_title("File Size Comparison: Source vs Processed")
    axes[0].grid(True, alpha=0.3)

    # Storage requirements
    total_source = source_bytes.sum() / 1e9  # Convert to GB
    total_output = output_bytes.sum() / 1e9

    storage_data = [total_source, total_output]
    storage_labels = ["Source Images", "Processed Images"]

    axes[1].bar(
        storage_labels, storage_data, color=["steelblue", "coral"], alpha=0.7, edgecolor="black"
    )
    axes[1].set_ylabel("Total Storage (GB)")
    axes[1].set_title("Total Storage Requirements")
    axes[1].grid(True, alpha=0.3, axis="y")

    plt.tight_layout()
    plt.show()
else:
    print("Not enough data for visualization")

## 5. Cost Analysis

Estimate AWS costs based on actual usage.

In [None]:
# Cost calculation
# Rough estimate derived from the record count, the recorded byte sizes and
# the measured Lambda durations, using the hard-coded unit prices below.
print("AWS Cost Estimate (us-east-1 pricing)\n")
print("=" * 60)

# S3 Costs
total_images = len(df)
if "source_size" in df.columns:
    total_source_gb = df["source_size"].dropna().sum() / 1e9
else:
    total_source_gb = 0

if "output_size" in df.columns:
    total_output_gb = df["output_size"].dropna().sum() / 1e9
else:
    total_output_gb = 0

s3_storage_cost = (total_source_gb + total_output_gb) * 0.023 * 7 / 30  # 7 days storage
s3_upload_cost = total_images * 0.005 / 1000  # $0.005 per 1000 PUT
s3_download_cost = (total_images * 0.4) * 0.0004 / 1000  # Estimate 40% retrieval

print("S3 Storage (7 days):")
print(f"  Raw images: {total_source_gb:.3f} GB")
print(f"  Processed:  {total_output_gb:.3f} GB")
print(f"  Cost:       ${s3_storage_cost:.4f}")
print("\nS3 Requests:")
print(f"  Uploads:    {total_images} PUT requests = ${s3_upload_cost:.4f}")
print(f"  Downloads:  {total_images * 0.4:.0f} GET requests = ${s3_download_cost:.4f}")

# Lambda Costs
# GB-seconds = execution seconds x configured memory in GB. The memory factor
# is applied exactly once, to measured and estimated durations alike.
lambda_memory_gb = 0.25  # 256 MB = 0.25 GB
if "processing_time_ms" in df.columns:
    total_lambda_seconds = df["processing_time_ms"].dropna().sum() / 1000
else:
    total_lambda_seconds = total_images * 0.030  # Estimate 30ms per image

total_lambda_gb_seconds = total_lambda_seconds * lambda_memory_gb

lambda_compute_cost = total_lambda_gb_seconds * 0.0000166667  # $0.0000166667 per GB-second
lambda_request_cost = total_images * 0.0000002  # $0.0000002 per request

print("\nLambda:")
print(f"  Invocations: {total_images} @ $0.0000002 = ${lambda_request_cost:.4f}")
print(f"  Compute:     {total_lambda_gb_seconds:.2f} GB-seconds = ${lambda_compute_cost:.4f}")

# DynamoDB Costs
dynamodb_write_units = total_images  # 1 write per image
dynamodb_cost = dynamodb_write_units * 1.25 / 1e6  # $1.25 per 1M writes (on-demand)

print("\nDynamoDB (on-demand):")
print(f"  Writes:      {dynamodb_write_units} items = ${dynamodb_cost:.4f}")

# Total
total_cost = (
    s3_storage_cost
    + s3_upload_cost
    + s3_download_cost
    + lambda_compute_cost
    + lambda_request_cost
    + dynamodb_cost
)

print(f"\n{'=' * 60}")
print(f"TOTAL ESTIMATED COST: ${total_cost:.4f}")
# max(..., 1) avoids dividing by zero when no records were retrieved
print(f"Cost per image:       ${total_cost / max(total_images, 1):.4f}")
print(f"{'=' * 60}")

## 6. Download and Inspect Sample Images

Retrieve sample processed images from S3 for visual inspection.

In [None]:
# Download sample processed images
# Fetches the processed files referenced by the first `max_samples` records
# into ./sample_images for visual inspection.
sample_dir = Path("./sample_images")
sample_dir.mkdir(exist_ok=True)

downloaded = 0
max_samples = 3

if s3_bucket:
    # Records without an output key show up as None/NaN in the DataFrame.
    # NaN is truthy, so test for a non-empty string explicitly rather than
    # relying on truthiness before building a path from the value.
    if "output_key" in df.columns:
        candidate_keys = [
            key for key in df["output_key"].head(max_samples) if isinstance(key, str) and key
        ]
    else:
        candidate_keys = []

    for output_key in candidate_keys:
        # Store under the key's base name, dropping any S3 prefix
        local_file = sample_dir / Path(output_key).name

        try:
            print(f"Downloading: {output_key}")
            s3_client.download_file(s3_bucket, output_key, str(local_file))
            downloaded += 1
        except Exception as e:
            # Best effort: report the failure and move on to the next sample
            print(f"  Error: {e}")

    print(f"\nDownloaded {downloaded} sample images")
else:
    print("S3 bucket not configured")

In [None]:
# Display sample images
# Shows up to three PNG files from the sample directory side by side.

from PIL import Image as PILImage

# glob() order depends on the filesystem; sort so the same files are shown
# in the same order on every run.
image_files = sorted(Path(sample_dir).glob("*.png"))

if image_files:
    n_images = min(len(image_files), 3)
    # squeeze=False always yields a 2-D array of axes, so a single image
    # needs no special-casing.
    fig, axes = plt.subplots(1, n_images, figsize=(15, 5), squeeze=False)

    for ax, img_file in zip(axes[0], image_files[:n_images]):
        # The context manager releases the file handle once the image has
        # been handed to matplotlib.
        with PILImage.open(img_file) as img:
            ax.imshow(img, cmap="gray")
        ax.set_title(img_file.name)
        ax.axis("off")

    plt.tight_layout()
    plt.show()
else:
    print("No sample images available")

## 7. Summary and Recommendations

Key findings and next steps.

In [None]:
# Final report: headline metrics, then static learning notes and next steps.
print("\n" + "=" * 70)
print("MEDICAL IMAGE PROCESSING - TIER 2 PROJECT SUMMARY")
print("=" * 70)

# Timezone-aware UTC timestamp; datetime.utcnow() is deprecated and returns a
# naive value that is easily mistaken for local time.
print(f"\nProject Completion Date: {datetime.now(timezone.utc).isoformat()}")

print("\nKEY METRICS:")
print(f"  Total images processed: {len(df)}")

if "processing_time_ms" in df.columns and len(df) > 0:
    print(f"  Average processing time: {df['processing_time_ms'].mean():.2f} ms")

if "source_size" in df.columns and len(df) > 0:
    print(f"  Total data processed: {df['source_size'].sum() / 1e6:.2f} MB")

# Four decimals, matching the cost-analysis cell: totals for a small batch are
# fractions of a cent and would all display as $0.00 with two decimals.
print(f"  Estimated cost: ${total_cost:.4f}")

print("\nWHAT YOU LEARNED:")
print("  ✓ How to use AWS S3 for cloud storage")
print("  ✓ How to deploy and invoke AWS Lambda functions")
print("  ✓ How to store metadata in NoSQL databases (DynamoDB)")
print("  ✓ How to query cloud data with boto3")
print("  ✓ How AWS pricing works for serverless architectures")

print("\nNEXT STEPS:")
print("  1. Clean up AWS resources using cleanup_guide.md")
print("  2. Review Lambda logs in CloudWatch")
print("  3. Check AWS Cost Explorer for actual costs")
print("  4. Consider Tier 3 for production infrastructure")
print("  5. Explore advanced features:")
print("     - Model inference in Lambda")
print("     - S3 event notifications for real-time processing")
print("     - CloudWatch monitoring and alerts")

print("\nRESOURCES:")
print("  - setup_guide.md: AWS setup instructions")
print("  - cleanup_guide.md: How to delete resources")
print("  - scripts/: Python scripts for data pipeline")

print("\n" + "=" * 70)