# Astronomical Image Processing - Tier 2 Analysis

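This notebook walks through the complete workflow for astronomical source detection using AWS services. Steps 1–2 are run from the project scripts (`scripts/upload_to_s3.py` and `scripts/invoke_lambda.py`); the notebook verifies their output in S3, then queries and visualizes the resulting source catalog.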

**Workflow:**
1. Upload FITS images to S3
2. Invoke Lambda for source detection
3. Query results with Athena (SQL)
4. Visualize and analyze findings

## Setup and Configuration

In [None]:
# Import required libraries
import os
import time
from pathlib import Path

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from botocore.exceptions import NoCredentialsError

# Set up plotting style
# IPython magic (not plain Python): asks the kernel to use the inline
# matplotlib backend for this notebook.
%matplotlib inline
# Apply the matplotlib style sheet first, then set the seaborn palette.
# NOTE(review): the "seaborn-v0_8-*" style names are believed to exist only in
# Matplotlib >= 3.6 — confirm against the environment's pinned version.
plt.style.use("seaborn-v0_8-darkgrid")
sns.set_palette("husl")

print("✓ Libraries imported successfully")

In [None]:
# Load environment variables
def load_env_file(path):
    """Copy `export KEY=value` lines from a shell-style env file into os.environ.

    Only lines that start with ``export `` (after trimming surrounding
    whitespace) and contain ``=`` are used; everything else is ignored.

    Args:
        path: pathlib.Path of the env file. A missing file is not an error.

    Returns:
        dict mapping each key that was set to the value stored in os.environ.
    """
    prefix = "export "
    loaded = {}
    if not path.exists():
        return loaded

    with open(path) as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line.startswith(prefix):
                continue
            # Drop only the leading keyword; str.replace would also remove any
            # later "export " that happens to appear inside the value.
            key, separator, value = line[len(prefix):].partition("=")
            key = key.strip()
            if not separator or not key:
                continue
            value = value.strip()
            # A shell would strip one pair of matching quotes around the value,
            # so do the same to keep both consumers of the file in agreement.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
                value = value[1:-1]
            os.environ[key] = value
            loaded[key] = value
    return loaded


env_file = Path.home() / ".astronomy_env"
load_env_file(env_file)

# Configuration
# Values come from the environment (possibly populated just above); the bucket
# names have no usable default, hence the warning further down.
BUCKET_RAW = os.environ.get("BUCKET_RAW", "")
BUCKET_CATALOG = os.environ.get("BUCKET_CATALOG", "")
LAMBDA_FUNCTION = os.environ.get("LAMBDA_FUNCTION", "astronomy-source-detection")
ATHENA_WORKGROUP = os.environ.get("ATHENA_WORKGROUP", "astronomy-workgroup")
AWS_REGION = os.environ.get("AWS_REGION", "us-east-1")

print("Configuration:")
print(f"  Raw Bucket: {BUCKET_RAW}")
print(f"  Catalog Bucket: {BUCKET_CATALOG}")
print(f"  Lambda Function: {LAMBDA_FUNCTION}")
print(f"  AWS Region: {AWS_REGION}")

if not BUCKET_RAW or not BUCKET_CATALOG:
    print("\n⚠ Warning: Some environment variables not set!")
    print("Run setup_guide.md to configure AWS resources")

In [None]:
# Initialize AWS clients
# All clients, including STS, are pinned to AWS_REGION so the notebook does not
# depend on a default region being configured outside of it.
try:
    s3 = boto3.client("s3", region_name=AWS_REGION)
    lambda_client = boto3.client("lambda", region_name=AWS_REGION)
    athena = boto3.client("athena", region_name=AWS_REGION)

    # Test credentials: this is the first call that actually needs them.
    sts = boto3.client("sts", region_name=AWS_REGION)
    identity = sts.get_caller_identity()

    print("✓ AWS Authentication Successful")
    print(f"  Account: {identity['Account']}")
    print(f"  User: {identity['Arn']}")
except NoCredentialsError:
    # Only the "no credentials at all" case is handled here; any other error
    # from the calls above propagates to the cell output.
    print("✗ AWS credentials not configured")
    print("Run: aws configure")

## Section 1: Check Uploaded Images

In [None]:
# List uploaded FITS images
def list_s3_objects(bucket, prefix):
    """Return key, size and modification time of every object under a prefix.

    Uses the module-level ``s3`` client and walks all ``list_objects_v2``
    pages.

    Args:
        bucket: name of the S3 bucket to list.
        prefix: key prefix the listing is restricted to.

    Returns:
        list of dicts with the keys "Key", "Size" and "LastModified". If
        listing fails, the error is printed and whatever was gathered before
        the failure is returned.
    """
    wanted_fields = ("Key", "Size", "LastModified")
    pager = s3.get_paginator("list_objects_v2")
    found = []

    try:
        for page in pager.paginate(Bucket=bucket, Prefix=prefix):
            # Pages without a "Contents" entry contribute nothing.
            for entry in page.get("Contents", []):
                found.append({field: entry[field] for field in wanted_fields})
    except Exception as e:
        print(f"Error: {e}")

    return found


# Check images
# List everything under images/ in the raw bucket and report per-file and
# total sizes in MB.
print(f"Images in s3://{BUCKET_RAW}/images/:\n")
images = list_s3_objects(BUCKET_RAW, "images/")

if not images:
    print("No images found. Run: python scripts/upload_to_s3.py")
else:
    df_images = pd.DataFrame(images).assign(
        Size_MB=lambda frame: frame["Size"] / 1024 / 1024
    )
    listing = df_images[["Key", "Size_MB", "LastModified"]]
    total_mb = df_images["Size_MB"].sum()
    print(listing.to_string())
    print(f"\nTotal: {len(images)} images, {total_mb:.1f} MB")

## Section 2: Source Detection Results

In [None]:
# List source detection results
# Show every object under sources/ in the catalog bucket with its size in KB.
print(f"Source catalogs in s3://{BUCKET_CATALOG}/sources/:\n")
catalogs = list_s3_objects(BUCKET_CATALOG, "sources/")

if not catalogs:
    print("No catalogs found. Run: python scripts/invoke_lambda.py")
else:
    df_catalogs = pd.DataFrame(catalogs).assign(
        Size_KB=lambda frame: frame["Size"] / 1024
    )
    listing = df_catalogs[["Key", "Size_KB", "LastModified"]]
    print(listing.to_string())
    print(f"\nTotal: {len(catalogs)} catalogs")

## Section 3: Athena Queries

In [None]:
def execute_athena_query(query_string, database="astronomy", timeout_seconds=60, poll_interval=1.0):
    """Run a query on Athena and return the complete result set as a DataFrame.

    Uses the module-level ``athena`` client, ``BUCKET_CATALOG`` (result output
    location) and ``ATHENA_WORKGROUP``.

    Args:
        query_string: SQL text to execute.
        database: Athena database used as the query execution context.
        timeout_seconds: how long to wait for the query to reach a terminal
            state before giving up.
        poll_interval: seconds to sleep between status checks.

    Returns:
        pandas.DataFrame whose column names come from the first result row and
        whose cells are the strings Athena returned (missing values become
        ""). An empty DataFrame if the query produced no rows at all. None if
        the query failed, was cancelled, or did not finish within
        ``timeout_seconds``; the reason is printed in each case.
    """

    # Start query
    response = athena.start_query_execution(
        QueryString=query_string,
        QueryExecutionContext={"Database": database},
        ResultConfiguration={"OutputLocation": f"s3://{BUCKET_CATALOG}/athena-results/"},
        WorkGroup=ATHENA_WORKGROUP,
    )

    query_id = response["QueryExecutionId"]

    # Wait for completion. The status is always checked at least once, and we
    # never fall through to fetching results for a query that has not
    # succeeded.
    deadline = time.monotonic() + timeout_seconds
    while True:
        execution = athena.get_query_execution(QueryExecutionId=query_id)
        status = execution["QueryExecution"]["Status"]
        state = status["State"]

        if state == "SUCCEEDED":
            break
        if state in ("FAILED", "CANCELLED"):
            # The reason field is read defensively so a missing value cannot
            # mask the real outcome with a KeyError.
            reason = status.get("StateChangeReason", "no reason given")
            print(f"Query {state.lower()}: {reason}")
            return None
        if time.monotonic() >= deadline:
            print(f"Query timed out after {timeout_seconds}s (last status: {state})")
            return None

        print(f"  Status: {state}...", end="\r")
        time.sleep(poll_interval)

    # Get results: walk every page, since a single get_query_results call
    # returns only the first page of a larger result set.
    rows = []
    paginator = athena.get_paginator("get_query_results")
    for page in paginator.paginate(QueryExecutionId=query_id):
        rows.extend(page["ResultSet"]["Rows"])

    if not rows:
        return pd.DataFrame()

    def cell_values(row):
        return [cell.get("VarCharValue", "") for cell in row["Data"]]

    # Convert to DataFrame: the first row carries the column names.
    headers = cell_values(rows[0])
    data = [cell_values(row) for row in rows[1:]]

    return pd.DataFrame(data, columns=headers)


print("Athena query function defined.")

In [None]:
# Query 1: Total sources
print("\n=== Total Sources ===")
try:
    query = "SELECT COUNT(*) as total_sources FROM astronomy.sources;"
    df = execute_athena_query(query)
    # execute_athena_query returns None (after printing the reason) when the
    # query fails, so check for it before formatting.
    if df is None:
        print("Note: Athena table may not exist yet. Run: python scripts/invoke_lambda.py")
    else:
        print(df.to_string(index=False))
except Exception as e:
    print(f"Query error: {e}")
    print("Note: Athena table may not exist yet. Run: python scripts/invoke_lambda.py")

In [None]:
# Query 2: Source statistics
# Aggregate count plus mean/max of flux and SNR over the whole sources table.
print("\n=== Source Statistics ===")
try:
    query = """
    SELECT
      COUNT(*) as total_sources,
      ROUND(AVG(CAST(flux AS DOUBLE)), 2) as mean_flux,
      ROUND(MAX(CAST(flux AS DOUBLE)), 2) as max_flux,
      ROUND(AVG(CAST(snr AS DOUBLE)), 2) as mean_snr,
      ROUND(MAX(CAST(snr AS DOUBLE)), 2) as max_snr
    FROM astronomy.sources;
    """
    df = execute_athena_query(query)
    # A failed query yields None and has already printed its reason; skip the
    # table instead of raising AttributeError on None.
    if df is not None:
        print(df.to_string(index=False))
except Exception as e:
    print(f"Query error: {e}")

In [None]:
# Query 3: Bright sources
# Ten highest-flux sources among those with SNR above 20.
print("\n=== Brightest 10 Sources ===")
try:
    query = """
    SELECT
      ROUND(CAST(ra AS DOUBLE), 4) as ra,
      ROUND(CAST(dec AS DOUBLE), 4) as dec,
      ROUND(CAST(flux AS DOUBLE), 1) as flux,
      ROUND(CAST(snr AS DOUBLE), 1) as snr
    FROM astronomy.sources
    WHERE CAST(snr AS DOUBLE) > 20
    ORDER BY CAST(flux AS DOUBLE) DESC
    LIMIT 10;
    """
    df = execute_athena_query(query)
    # A failed query yields None and has already printed its reason; skip the
    # table instead of raising AttributeError on None.
    if df is not None:
        print(df.to_string(index=False))
except Exception as e:
    print(f"Query error: {e}")

## Section 4: Data Analysis and Visualization

In [None]:
# Load all source data for visualization
# Leaves `df_sources` as a DataFrame with numeric measurement columns, or as
# None when nothing could be loaded (later cells check for None).
print("Loading source data...")
try:
    query = """
    SELECT
      image_id,
      source_id,
      CAST(ra AS DOUBLE) as ra,
      CAST(dec AS DOUBLE) as dec,
      CAST(x AS DOUBLE) as x,
      CAST(y AS DOUBLE) as y,
      CAST(flux AS DOUBLE) as flux,
      CAST(peak AS DOUBLE) as peak,
      CAST(fwhm AS DOUBLE) as fwhm,
      CAST(snr AS DOUBLE) as snr
    FROM astronomy.sources
    ORDER BY flux DESC
    LIMIT 1000;
    """
    df_sources = execute_athena_query(query)
    if df_sources is None:
        # The query function already printed why the query did not succeed.
        print("✗ Query did not succeed; no source data loaded")
    else:
        # Result cells arrive as strings regardless of the SQL CAST, so convert
        # the measurement columns before any arithmetic or plotting. Values
        # that cannot be parsed (including empty strings) become NaN.
        numeric_columns = ["ra", "dec", "x", "y", "flux", "peak", "fwhm", "snr"]
        for column in numeric_columns:
            if column in df_sources.columns:
                df_sources[column] = pd.to_numeric(df_sources[column], errors="coerce")
        print(f"✓ Loaded {len(df_sources)} sources")
        print(df_sources.head(10).to_string())
except Exception as e:
    print(f"Error: {e}")
    df_sources = None

In [None]:
# Flux distribution
# 2x2 overview figure: flux histogram, SNR histogram, flux-vs-SNR scatter and
# FWHM histogram, saved to source_distributions.png.
if df_sources is not None:
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    (ax_flux, ax_snr), (ax_flux_snr, ax_fwhm) = axes

    # Top-left: flux histogram with logarithmic counts
    ax_flux.hist(df_sources["flux"], bins=50, edgecolor="black")
    ax_flux.set(xlabel="Flux (μJy)", ylabel="Count", title="Flux Distribution")
    ax_flux.set_yscale("log")

    # Top-right: SNR histogram
    ax_snr.hist(df_sources["snr"], bins=50, edgecolor="black", color="orange")
    ax_snr.set(xlabel="Signal-to-Noise Ratio", ylabel="Count", title="SNR Distribution")

    # Bottom-left: flux against SNR, logarithmic flux axis
    ax_flux_snr.scatter(df_sources["flux"], df_sources["snr"], alpha=0.5, s=20)
    ax_flux_snr.set(xlabel="Flux (μJy)", ylabel="Signal-to-Noise Ratio", title="Flux vs SNR")
    ax_flux_snr.set_xscale("log")

    # Bottom-right: FWHM histogram
    ax_fwhm.hist(df_sources["fwhm"], bins=40, edgecolor="black", color="green")
    ax_fwhm.set(xlabel="FWHM (pixels)", ylabel="Count", title="FWHM Distribution")

    plt.tight_layout()
    plt.savefig("source_distributions.png", dpi=100, bbox_inches="tight")
    plt.show()
    print("✓ Saved: source_distributions.png")

In [None]:
# Sky map - spatial distribution
# Scatter of source positions: colour encodes log10(flux), marker area scales
# with SNR. Saved to sky_distribution.png.
if df_sources is not None and len(df_sources) > 0:
    fig, ax = plt.subplots(figsize=(12, 8))

    log_flux = np.log10(df_sources["flux"])
    marker_area = df_sources["snr"] * 2

    scatter = ax.scatter(
        df_sources["ra"],
        df_sources["dec"],
        c=log_flux,
        s=marker_area,
        alpha=0.6,
        cmap="viridis",
    )

    ax.set(
        xlabel="RA (degrees)",
        ylabel="Dec (degrees)",
        title="Astronomical Source Distribution",
    )

    cbar = fig.colorbar(scatter, ax=ax, label="log10(Flux)")

    plt.tight_layout()
    plt.savefig("sky_distribution.png", dpi=100, bbox_inches="tight")
    plt.show()
    print("✓ Saved: sky_distribution.png")

## Section 5: Summary Statistics

In [None]:
# Summary
# Text report of flux, SNR and position ranges for the loaded sources,
# followed by the closing "next steps" banner (printed in every case).
banner = "=" * 60

if df_sources is not None:
    print("\n" + banner)
    print("ANALYSIS SUMMARY")
    print(banner)
    print(f"\nTotal sources analyzed: {len(df_sources)}")

    # (section heading, column, unit suffix appended to each value)
    for heading, column, unit in (
        ("Flux Statistics", "flux", " μJy"),
        ("SNR Statistics", "snr", ""),
    ):
        print(f"\n{heading}:")
        values = df_sources[column].astype(float)
        print(f"  Mean: {values.mean():.1f}{unit}")
        print(f"  Median: {values.median():.1f}{unit}")
        print(f"  Min: {values.min():.1f}{unit}")
        print(f"  Max: {values.max():.1f}{unit}")

    print("\nSpatial Statistics:")
    for label, column in (("RA", "ra"), ("Dec", "dec")):
        coords = df_sources[column].astype(float)
        print(f"  {label} range: {coords.min():.2f} - {coords.max():.2f}°")

print("\n" + banner)
print("Project Complete!")
print(banner)
print("\nNext steps:")
for step in (
    "  1. Export results: df_sources.to_csv('sources.csv')",
    "  2. When finished, run cleanup_guide.md to delete AWS resources",
    "  3. Check costs in AWS Cost Explorer",
):
    print(step)
print("  3. Check costs in AWS Cost Explorer")