# Ocean Data Analysis - AWS Tier 2

Analyze oceanographic observations stored in AWS DynamoDB.

This notebook demonstrates:
1. Generating sample ocean data
2. Uploading to S3 (triggers Lambda processing)
3. Querying results from DynamoDB
4. Visualizing ocean parameters
5. Analyzing marine anomalies

## 1. Setup and Configuration

In [None]:
import os
from datetime import datetime, timedelta, timezone
from decimal import Decimal

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Plot defaults shared by every figure in this notebook:
# seaborn's white grid theme and a wide default canvas.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

print("✓ Libraries imported successfully")

In [None]:
# Deployment settings. Each value is read from the environment variable of the
# same name; the literal is the fallback used when the variable is not set.
BUCKET_NAME = os.getenv("BUCKET_NAME", "ocean-data-YOUR-NAME")  # Replace with your bucket
DYNAMODB_TABLE = os.getenv("DYNAMODB_TABLE", "OceanObservations")
AWS_REGION = os.getenv("AWS_REGION", "us-east-1")

# Echo the effective settings so a wrong bucket/table is noticed early.
print(
    "AWS Configuration:\n"
    f"  Bucket: {BUCKET_NAME}\n"
    f"  DynamoDB Table: {DYNAMODB_TABLE}\n"
    f"  Region: {AWS_REGION}"
)

# Service handles used by the rest of the notebook, both bound to AWS_REGION:
# a low-level S3 client (uploads) and a DynamoDB resource (table scans).
s3_client = boto3.client("s3", region_name=AWS_REGION)
dynamodb = boto3.resource("dynamodb", region_name=AWS_REGION)

print("\n✓ AWS clients initialized")

## 2. Generate and Upload Sample Ocean Data

In [None]:
def generate_ocean_data(num_profiles=5, depths=15, seed=None):
    """
    Generate sample oceanographic data.

    Builds ``num_profiles`` synthetic vertical profiles, cycling through four
    fixed stations, each sampled at ``depths`` evenly spaced levels between
    0 and 400. Profile ``i`` is time-stamped ``i`` days after (now - 7 days),
    in UTC. One anomaly is injected per station:

    - Gulf Stream: +3.8 on temperature above 50 (marine heatwave)
    - Sargasso Sea: -0.4 on pH at every depth (acidification)
    - Station ALOHA: dissolved oxygen lowered by 2.5 (floored at 1.2)
      strictly between 200 and 350 (oxygen minimum zone)
    - Labrador Sea: chlorophyll fixed at 28.0 above 30 (spring bloom)

    Args:
        num_profiles: Number of profiles to generate.
        depths: Number of depth levels per profile.
        seed: Optional seed for reproducible noise. When given, noise is drawn
            from a private ``np.random.default_rng(seed)``. When ``None``
            (the default) the global ``np.random`` state is used, exactly as
            before this parameter existed.

    Returns:
        pandas.DataFrame with one row per (profile, depth level) and columns
        ``timestamp``, ``location_name``, ``latitude``, ``longitude``,
        ``depth``, ``temperature``, ``salinity``, ``ph``,
        ``dissolved_oxygen`` and ``chlorophyll``. Empty when
        ``num_profiles`` or ``depths`` is 0.
    """
    # Both the np.random module and a Generator expose .normal(loc, scale).
    rng = np.random if seed is None else np.random.default_rng(seed)

    locations = [
        {"name": "Gulf Stream", "lat": 40.5, "lon": -70.2},
        {"name": "Sargasso Sea", "lat": 32.0, "lon": -64.0},
        {"name": "Labrador Sea", "lat": 58.0, "lon": -52.0},
        {"name": "Station ALOHA", "lat": 22.75, "lon": -158.0},
    ]

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # "now" and drop tzinfo so isoformat() + "Z" keeps the original string
    # shape instead of producing "...+00:00Z".
    base_time = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=7)

    # The depth grid is the same for every profile, so build it once.
    depth_levels = np.linspace(0, 400, depths)

    data = []
    for i in range(num_profiles):
        location = locations[i % len(locations)]
        station = location["name"]
        timestamp = (base_time + timedelta(days=i)).isoformat() + "Z"

        for depth in depth_levels:
            # NOTE: the six noise draws below keep the original order so that
            # seeding the global np.random state yields the same values as before.

            # Temperature: exponential blend from a warm surface value to a
            # cold deep value; both end members are re-drawn at every level.
            surface_temp = 20.0 + rng.normal(0, 2)
            deep_temp = 4.0 + rng.normal(0, 0.5)
            mixing = np.exp(-depth / 100)
            temperature = surface_temp * mixing + deep_temp * (1 - mixing)

            # Add marine heatwave to Gulf Stream
            if station == "Gulf Stream" and depth < 50:
                temperature += 3.8

            # Salinity: increases linearly with depth, plus noise
            salinity = 35.0 + depth / 200 + rng.normal(0, 0.2)

            # pH: decreases linearly with depth, plus noise
            ph = 8.1 - depth / 1000 + rng.normal(0, 0.05)
            if station == "Sargasso Sea":
                ph -= 0.4  # Acidification

            # Dissolved oxygen: exponential decay towards a background of 2.0
            do = 8.0 * np.exp(-depth / 150) + 2.0 + rng.normal(0, 0.3)
            if station == "Station ALOHA" and 200 < depth < 350:
                do = max(1.2, do - 2.5)  # Oxygen minimum zone

            # Chlorophyll: surface maximum decaying with depth, floored at 0.1
            chlorophyll = max(0.1, 5.0 * np.exp(-depth / 50) + rng.normal(0, 0.5))
            if station == "Labrador Sea" and depth < 30:
                chlorophyll = 28.0  # Spring bloom

            data.append(
                {
                    "timestamp": timestamp,
                    "location_name": station,
                    "latitude": location["lat"],
                    "longitude": location["lon"],
                    "depth": round(depth, 1),
                    "temperature": round(temperature, 2),
                    "salinity": round(salinity, 2),
                    "ph": round(ph, 3),
                    # Noise can push oxygen below zero; clamp to 0.1.
                    "dissolved_oxygen": round(max(0.1, do), 2),
                    "chlorophyll": round(max(0.1, chlorophyll), 2),
                }
            )

    return pd.DataFrame(data)


# Build the demo dataset (5 profiles x 15 depth levels), report its size,
# and leave the first ten rows as the cell's displayed result.
df_sample = generate_ocean_data(num_profiles=5, depths=15)
print(f"Generated {len(df_sample)} ocean observations\n\nSample data:")
df_sample.head(10)

In [None]:
# Save to CSV
# Timezone-aware UTC "now": datetime.utcnow() is deprecated since Python 3.12.
# The strftime output (and therefore the file name format) is unchanged.
timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
csv_filename = f"ocean_data_{timestamp}.csv"
df_sample.to_csv(csv_filename, index=False)
print(f"Saved to {csv_filename}")

# Upload to S3 (this will trigger Lambda processing)
s3_key = f"raw/{csv_filename}"
print(f"\nUploading to S3: s3://{BUCKET_NAME}/{s3_key}")

try:
    s3_client.upload_file(csv_filename, BUCKET_NAME, s3_key)
except Exception as e:
    # Deliberately broad: this is the notebook's top-level boundary, and any
    # failure (credentials, missing bucket, network) should be reported
    # without a traceback so the remaining cells can still be inspected.
    print(f"✗ Upload failed: {e}")
else:
    # Success messages live in `else` so that only the upload call itself
    # is covered by the exception handler.
    print("✓ Upload successful!")
    print("\nLambda function will process this file automatically.")
    print("Wait 10-15 seconds for processing to complete...")

In [None]:
# Wait for Lambda processing
# This is a fixed pause only: nothing here checks that the Lambda function
# actually ran or that any rows were written. If the query cell below comes
# back empty, re-run it after waiting a little longer.
import time

print("Waiting for Lambda processing...")
time.sleep(15)  # matches the upper end of the "10-15 seconds" hint printed after upload
print("✓ Processing should be complete")

## 3. Query Results from DynamoDB

In [None]:
def query_dynamodb(table_name, days=7):
    """
    Read every item from a DynamoDB table using a paginated scan.

    Args:
        table_name: Name of the table, resolved through the module-level
            ``dynamodb`` resource.
        days: Accepted but not used by the current implementation; no time
            filter is applied and all items are returned.

    Returns:
        List of item dicts. Top-level ``Decimal`` attribute values are
        converted to ``float``; every other value is passed through as-is.
    """
    table = dynamodb.Table(table_name)

    # A single scan call returns one page; keep requesting pages, resuming
    # from LastEvaluatedKey, until the response no longer carries one.
    rows = []
    scan_kwargs = {}
    while True:
        page = table.scan(**scan_kwargs)
        rows.extend(page["Items"])
        if "LastEvaluatedKey" not in page:
            break
        scan_kwargs["ExclusiveStartKey"] = page["LastEvaluatedKey"]

    # Numbers arrive as Decimal; convert them to float for the caller.
    return [
        {
            attr: float(val) if isinstance(val, Decimal) else val
            for attr, val in row.items()
        }
        for row in rows
    ]


# Query observations
print("Querying DynamoDB...")
observations = query_dynamodb(DYNAMODB_TABLE)
print(f"Retrieved {len(observations)} observations")

# Convert to DataFrame
df = pd.DataFrame(observations)

if not df.empty:
    # Chronological order; later cells read this sorted frame.
    df = df.sort_values("timestamp")
    print(f"\nDataFrame shape: {df.shape}")
    print(f"\nColumns: {list(df.columns)}")
    # A bare `df.head()` here would be silently discarded: notebooks only
    # auto-display the last top-level expression of a cell, not one nested
    # inside an `if`. Print the preview explicitly instead.
    print(df.head())

## 4. Summary Statistics

In [None]:
if df.empty:
    print("No data available yet. Make sure Lambda processing completed.")
else:
    print("OCEAN OBSERVATIONS SUMMARY")
    print("=" * 80)

    print(f"\nTotal Observations: {len(df)}")

    # Row counts per category, listed in value_counts() order.
    for heading, column in (
        ("\nLocations:", "location_name"),
        ("\nAnomaly Status:", "anomaly_status"),
    ):
        print(heading)
        for category, n_rows in df[column].value_counts().items():
            print(f"  - {category}: {n_rows}")

    # describe() gives count/mean/std/min/quartiles/max for each measured parameter.
    print("\nParameter Statistics:")
    params = ["temperature", "salinity", "ph", "dissolved_oxygen", "chlorophyll"]
    print(df[params].describe())

## 5. Depth Profiles

In [None]:
if not df.empty:
    # One panel per parameter, filled in row-major order:
    # (DataFrame column, x-axis label, panel title)
    panels = [
        ("temperature", "Temperature (°C)", "Temperature Profiles"),
        ("salinity", "Salinity (PSU)", "Salinity Profiles"),
        ("dissolved_oxygen", "Dissolved Oxygen (mg/L)", "Dissolved Oxygen Profiles"),
        ("chlorophyll", "Chlorophyll-a (mg/m³)", "Chlorophyll Profiles"),
    ]

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    for ax, (column, xlabel, title) in zip(axes.flat, panels):
        for location in df["location_name"].unique():
            # df is ordered by timestamp only, so rows that share a timestamp
            # arrive in arbitrary depth order. Sort by depth so the line runs
            # from the surface downwards instead of jumping between levels.
            # The stable sort keeps the timestamp order among equal depths.
            # NOTE: a location with several profiles is still drawn as one
            # line through all of its points.
            data = df[df["location_name"] == location].sort_values("depth", kind="stable")
            ax.plot(data[column], data["depth"], marker="o", label=location)

        # Oceanographic convention: surface at the top, depth increasing downwards.
        ax.invert_yaxis()
        ax.set_xlabel(xlabel)
        ax.set_ylabel("Depth (m)")
        ax.set_title(title)
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig("ocean_depth_profiles.png", dpi=300, bbox_inches="tight")
    print("✓ Saved figure: ocean_depth_profiles.png")
    plt.show()

## 6. Temperature-Salinity (T-S) Diagram

In [None]:
if not df.empty:
    plt.figure(figsize=(10, 8))

    # One scatter series per location so each gets its own colour and
    # legend entry; salinity on x, temperature on y.
    for station in df["location_name"].unique():
        station_rows = df.loc[df["location_name"] == station]
        plt.scatter(
            station_rows["salinity"],
            station_rows["temperature"],
            s=50,
            alpha=0.6,
            label=station,
        )

    plt.title("Temperature-Salinity Diagram", fontsize=14)
    plt.xlabel("Salinity (PSU)", fontsize=12)
    plt.ylabel("Temperature (°C)", fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.legend()

    plt.tight_layout()
    plt.savefig("ts_diagram.png", dpi=300, bbox_inches="tight")
    print("✓ Saved figure: ts_diagram.png")
    plt.show()

## 7. Anomaly Analysis

In [None]:
if not df.empty:
    # Keep only rows whose anomaly_status is "warning" or "critical".
    is_flagged = df["anomaly_status"].isin(["warning", "critical"])
    anomalies = df[is_flagged]

    print(f"\nMARINE ANOMALIES DETECTED: {len(anomalies)}")
    print("=" * 80)

    if anomalies.empty:
        print("No anomalies detected.")
    else:
        # One text record per flagged row, terminated by a dashed rule.
        for _idx, row in anomalies.iterrows():
            record = [
                f"\nLocation: {row['location_name']}",
                f"Depth: {row['depth']:.0f}m",
                f"Status: {row['anomaly_status'].upper()}",
                f"Type: {row['anomaly_type']}",
                f"Temperature: {row['temperature']:.2f}°C (anomaly: {row['temperature_anomaly']:.2f}°C)",
                f"pH: {row['ph']:.3f}",
                f"DO: {row['dissolved_oxygen']:.2f} mg/L",
                f"Chlorophyll: {row['chlorophyll']:.2f} mg/m³",
                "-" * 80,
            ]
            print("\n".join(record))

## 8. Ocean Acidification Analysis

In [None]:
if not df.empty:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    ph_ax, aragonite_ax = axes

    # pH vs Depth
    for location in df["location_name"].unique():
        # df is ordered by timestamp only, so rows sharing a timestamp arrive
        # in arbitrary depth order. Sort by depth so the profile line runs
        # from the surface downwards. The stable sort keeps the timestamp
        # order among equal depths.
        data = df[df["location_name"] == location].sort_values("depth", kind="stable")
        ph_ax.plot(data["ph"], data["depth"], marker="o", label=location)

    # Reference lines: (pH value, colour, legend label)
    ph_thresholds = [
        (8.1, "green", "Normal pH"),
        (7.8, "orange", "Warning"),
        (7.6, "red", "Critical"),
    ]
    for ph_value, color, label in ph_thresholds:
        ph_ax.axvline(x=ph_value, color=color, linestyle="--", label=label, alpha=0.5)
    ph_ax.invert_yaxis()  # surface at the top
    ph_ax.set_xlabel("pH")
    ph_ax.set_ylabel("Depth (m)")
    ph_ax.set_title("pH Profile with Thresholds")
    ph_ax.legend()
    ph_ax.grid(True, alpha=0.3)

    # Aragonite Saturation
    for location in df["location_name"].unique():
        data = df[df["location_name"] == location].sort_values("depth", kind="stable")
        aragonite_ax.plot(data["aragonite_saturation"], data["depth"], marker="o", label=location)

    # The reference line at 1.0 is labelled "Undersaturated" in the legend.
    aragonite_ax.axvline(x=1.0, color="red", linestyle="--", label="Undersaturated", alpha=0.5)
    aragonite_ax.invert_yaxis()  # surface at the top
    aragonite_ax.set_xlabel("Aragonite Saturation (Ωarag)")
    aragonite_ax.set_ylabel("Depth (m)")
    aragonite_ax.set_title("Aragonite Saturation State")
    aragonite_ax.legend()
    aragonite_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig("ocean_acidification.png", dpi=300, bbox_inches="tight")
    print("✓ Saved figure: ocean_acidification.png")
    plt.show()

## 9. Primary Production Analysis

In [None]:
if not df.empty:
    # Surface observations only (< 50m)
    surface = df[df["depth"] < 50]

    plt.figure(figsize=(12, 5))

    # Two side-by-side boxplots grouped by location:
    # (DataFrame column, panel title, y-axis label)
    boxplot_panels = (
        ("chlorophyll", "Surface Chlorophyll by Location", "Chlorophyll-a (mg/m³)"),
        (
            "primary_production",
            "Primary Production by Location",
            "Primary Production (mg C/m²/day)",
        ),
    )
    for slot, (column, title, ylabel) in enumerate(boxplot_panels, start=1):
        plt.subplot(1, 2, slot)
        surface.boxplot(column=column, by="location_name", ax=plt.gca())
        plt.title(title)
        # Grouped pandas boxplots add their own figure-level title; blank it.
        plt.suptitle("")
        plt.xlabel("Location")
        plt.ylabel(ylabel)
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.savefig("primary_production.png", dpi=300, bbox_inches="tight")
    print("✓ Saved figure: primary_production.png")
    plt.show()

## 10. Export Results

In [None]:
if not df.empty:
    # One timezone-aware UTC stamp shared by both files, so the two exports
    # of a single run always carry the same suffix. (datetime.utcnow() is
    # deprecated since Python 3.12; the strftime output is unchanged.)
    export_stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    # Export full dataset
    export_filename = f"ocean_observations_{export_stamp}.csv"
    df.to_csv(export_filename, index=False)
    print(f"✓ Exported {len(df)} observations to {export_filename}")

    # Export anomalies only
    # Re-derive the subset here (same filter as the anomaly analysis cell)
    # so this cell does not fail with a NameError when that cell was skipped.
    anomalies = df[df["anomaly_status"].isin(["warning", "critical"])]
    if not anomalies.empty:
        anomaly_filename = f"marine_anomalies_{export_stamp}.csv"
        anomalies.to_csv(anomaly_filename, index=False)
        print(f"✓ Exported {len(anomalies)} anomalies to {anomaly_filename}")

## Summary

This notebook demonstrated:
- ✓ Generating realistic oceanographic data
- ✓ Uploading to S3 (triggering Lambda processing)
- ✓ Querying processed results from DynamoDB
- ✓ Visualizing ocean depth profiles
- ✓ T-S diagram analysis
- ✓ Marine anomaly detection (heatwaves, acidification, hypoxia, blooms)
- ✓ Ocean acidification metrics
- ✓ Primary production analysis

### Next Steps:
1. Upload your own ocean data
2. Modify Lambda thresholds for your region
3. Add more parameters (nutrients, currents)
4. Implement time series analysis
5. Create spatial maps using cartopy
6. Move to Tier 3 for production deployment