# Climate Data Analysis with AWS S3 and Lambda
## Tier 2 Project - Analyze processed climate data

This notebook demonstrates how to:
1. Download processed results from S3
2. Analyze climate statistics
3. Create visualizations
4. Export reports

## Setup

First, let's import the necessary libraries and configure AWS access.

In [None]:
import os
import sys
import json
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables
# (python-dotenv: presumably reads a local .env file so the AWS_* settings
# used in the configuration cell can be supplied without exporting them
# in the shell — confirm a .env file is part of the project setup)
load_dotenv()

# Configure matplotlib
# NOTE(review): the 'seaborn-v0_8-*' style names appear to exist only in newer
# Matplotlib releases — confirm the pinned Matplotlib version provides it.
plt.style.use('seaborn-v0_8-darkgrid')
# IPython magic: render figures inline in the notebook output.
%matplotlib inline

print("✓ Libraries imported successfully")

## Configuration

Set up S3 connection parameters

In [None]:
# Resolve deployment settings from the environment, falling back to the
# tutorial defaults when a variable is not set.
AWS_REGION = os.getenv('AWS_REGION', 'us-east-1')
BUCKET_NAME = os.getenv('AWS_S3_BUCKET', 'climate-data-unknown')
LAMBDA_FUNCTION = os.getenv('AWS_LAMBDA_FUNCTION', 'process-climate-data')

# Echo the effective settings so a misconfigured environment is easy to spot.
for label, value in (
    ("AWS Region", AWS_REGION),
    ("S3 Bucket", BUCKET_NAME),
    ("Lambda Function", LAMBDA_FUNCTION),
):
    print(f"{label}: {value}")

# Notebook-wide S3 client, bound to the configured region; the helper
# functions in later cells use this module-level `s3` object.
s3 = boto3.client('s3', region_name=AWS_REGION)
print("\n✓ AWS S3 client configured")

## Step 1: List Available Results

Check what results are available in S3

In [None]:
def list_s3_files(bucket, prefix='results/', client=None):
    """List the objects stored in an S3 bucket under a key prefix.

    Uses the ``list_objects_v2`` paginator so that listings larger than a
    single response page (S3 returns at most 1000 keys per call) are
    returned in full instead of being silently truncated.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix to restrict the listing to.
        client: Optional S3 client to use. Defaults to the notebook-level
            ``s3`` client created in the configuration cell.

    Returns:
        A list of dicts with keys ``'key'`` (object key), ``'size_mb'``
        (object size in bytes divided by 1e6) and ``'modified'`` (the
        ``LastModified`` value reported by S3). Returns an empty list when
        nothing matches or when the listing fails; in the failure case the
        error is printed rather than raised (best-effort behaviour for
        interactive use).
    """
    # Fall back to the shared notebook client only when none was supplied.
    client = s3 if client is None else client

    try:
        paginator = client.get_paginator('list_objects_v2')

        files = []
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            # Pages without matches carry no 'Contents' key at all.
            for obj in page.get('Contents', []):
                files.append({
                    'key': obj['Key'],
                    'size_mb': obj['Size'] / 1e6,
                    'modified': obj['LastModified']
                })

        if not files:
            print(f"No files found in {prefix}")

        return files
    except Exception as e:
        # Deliberately non-fatal: report the problem and return no files.
        print(f"Error listing files: {e}")
        return []

# Fetch the listing of processed outputs under the results/ prefix
results_files = list_s3_files(BUCKET_NAME, 'results/')

print(f"Found {len(results_files)} result files:\n")

# Preview only the first ten entries to keep the cell output short
preview = results_files[:10]
overflow = len(results_files) - 10
for entry in preview:
    print(f"  {entry['key']} ({entry['size_mb']:.2f}MB)")

# Mention how many entries were left out of the preview, if any
if overflow > 0:
    print(f"  ... and {overflow} more")

## Step 2: Download Results

Download the processed results from S3 to the local machine.

In [None]:
def download_s3_results(bucket, prefix='results/', output_dir='./results'):
    """Download every object under an S3 prefix into a local directory.

    Each object is saved under ``output_dir`` using only the final path
    component of its key. Keys ending in ``'/'`` are skipped: they are
    zero-byte "folder" placeholder objects, and downloading them would
    create an empty file named after the folder that later fails to parse
    as a result file.

    Args:
        bucket: Name of the S3 bucket to download from.
        prefix: Key prefix selecting the objects to download.
        output_dir: Local directory to write files into; created if missing.

    Returns:
        A list of local file paths (as strings) for the objects that were
        downloaded successfully. Per-object failures are printed and
        skipped rather than raised.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    files = list_s3_files(bucket, prefix)
    downloaded = []

    for f in files:
        key = f['key']

        # Folder placeholder objects carry no data; nothing to download.
        if key.endswith('/'):
            continue

        try:
            # NOTE(review): only the basename is kept, so objects in different
            # sub-prefixes that share a file name overwrite each other locally.
            output_path = output_dir / Path(key).name
            print(f"Downloading: {key}")

            s3.download_file(bucket, key, str(output_path))
            downloaded.append(str(output_path))
            print(f"  ✓ Saved to {output_path}")
        except Exception as e:
            # Best-effort: report the failure and continue with the next object.
            print(f"  ✗ Error: {e}")

    print(f"\n✓ Downloaded {len(downloaded)} files")
    return downloaded

# Pull every object under the default results/ prefix into ./results
downloaded_files = download_s3_results(BUCKET_NAME)

# An empty list means nothing was fetched; point the reader at the setup guide
if len(downloaded_files) == 0:
    print("\n⚠️  No results to download. Make sure to run setup_guide.md first!")

## Step 3: Parse and Analyze Results

Load and analyze the processed climate data

In [None]:
def read_json_files(file_paths):
    """Parse a collection of JSON files into Python objects.

    Files are opened explicitly as UTF-8 so that parsing does not depend on
    the platform's default text encoding.

    Args:
        file_paths: Iterable of paths to JSON files.

    Returns:
        A list with the parsed content of every file that could be read,
        in input order. Files that cannot be opened or parsed are reported
        with a printed message and skipped, so the result may be shorter
        than ``file_paths``.
    """
    data = []

    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data.append(json.load(f))
        except Exception as e:
            # Best-effort: one unreadable file must not abort the whole load.
            print(f"Error reading {file_path}: {e}")

    return data

# Parse every downloaded file into an in-memory list of result objects
results_data = read_json_files(downloaded_files)
print(f"Loaded {len(results_data)} result files\n")

# Preview the first parsed result, truncated to its first 500 characters
if len(results_data) > 0:
    print("Sample result file:")
    sample_json = json.dumps(results_data[0], indent=2)
    print(sample_json[:500] + "...")

## Step 4: Create Analysis DataFrame

Organize results into a pandas DataFrame for analysis

In [None]:
def create_analysis_dataframe(results_data):
    """Flatten parsed result dicts into one DataFrame row per result.

    Each row carries the result's ``'file'`` entry (``'unknown'`` when
    absent). When the result's ``'statistics'`` mapping has a
    ``'temperature'`` section, its mean/std/min/max are copied to
    ``temp_*`` columns; when it has a ``'precipitation'`` section, its
    mean/std/total are copied to ``precip_*`` columns. Statistics missing
    from a present section are stored as ``None``.
    """
    # (section name in the result, column prefix, statistics to copy)
    sections = (
        ('temperature', 'temp', ('mean', 'std', 'min', 'max')),
        ('precipitation', 'precip', ('mean', 'std', 'total')),
    )

    def to_row(result):
        row = {'file': result.get('file', 'unknown')}
        stats = result.get('statistics', {})
        for section, column_prefix, stat_names in sections:
            if section not in stats:
                continue
            values = stats[section]
            for stat in stat_names:
                row[f'{column_prefix}_{stat}'] = values.get(stat)
        return row

    return pd.DataFrame([to_row(result) for result in results_data])

# Flatten the parsed results into one row per result file
df = create_analysis_dataframe(results_data)

# Report the frame's dimensions and columns, then show a preview
frame_shape = df.shape
column_names = list(df.columns)
print(f"Analysis DataFrame shape: {frame_shape}")
print(f"\nColumns: {column_names}")
print("\nFirst few rows:")
display(df.head())

## Step 5: Summary Statistics

Calculate aggregate statistics across all processed files

In [None]:
# 60-character rule framing the report
banner = "=" * 60

print("\n" + banner)
print("CLIMATE DATA SUMMARY STATISTICS")
print(banner)

print(f"\nTotal files processed: {len(df)}")

# Temperature: aggregate the per-file means and the overall min/max
if 'temp_mean' in df.columns:
    temp_means = df['temp_mean']
    print("\nTemperature (K):")
    print(f"  Mean of means: {temp_means.mean():.2f} K")
    print(f"  Std dev: {temp_means.std():.2f} K")
    print(f"  Range: {df['temp_min'].min():.2f} - {df['temp_max'].max():.2f} K")

# Precipitation: aggregate the per-file means and sum the per-file totals
if 'precip_mean' in df.columns:
    precip_means = df['precip_mean']
    print("\nPrecipitation (kg m-2 s-1):")
    print(f"  Mean of means: {precip_means.mean():.2e}")
    print(f"  Std dev: {precip_means.std():.2e}")
    print(f"  Total: {df['precip_total'].sum():.2e}")

print("\n" + banner)

## Step 6: Visualizations

Create publication-quality figures

In [None]:
# Create figure with multiple subplots: three plots plus a text summary panel
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Climate Data Analysis Summary', fontsize=16, fontweight='bold')

# Which variables are actually present. The columns only exist when at least
# one result file carried the corresponding statistics, so every use below
# (plots AND the text summary) must be guarded to avoid a KeyError.
has_temp = 'temp_mean' in df.columns
has_precip = 'precip_mean' in df.columns and 'precip_total' in df.columns

# Temperature mean distribution
if has_temp:
    # Drop missing values so files without a temperature mean do not
    # interfere with the histogram's bin range.
    axes[0, 0].hist(df['temp_mean'].dropna(), bins=10, color='red', alpha=0.7, edgecolor='black')
    axes[0, 0].set_xlabel('Temperature Mean (K)')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].set_title('Temperature Distribution')
    axes[0, 0].grid(True, alpha=0.3)

# Temperature range (per-file mean with its standard deviation as error bar)
if has_temp and 'temp_std' in df.columns:
    x = range(len(df))
    axes[0, 1].errorbar(x, df['temp_mean'], yerr=df['temp_std'],
                        fmt='o', capsize=5, capthick=2, alpha=0.7)
    axes[0, 1].set_xlabel('File Index')
    axes[0, 1].set_ylabel('Temperature (K)')
    axes[0, 1].set_title('Temperature with Uncertainty')
    axes[0, 1].grid(True, alpha=0.3)

# Precipitation distribution
if 'precip_mean' in df.columns:
    axes[1, 0].hist(df['precip_mean'].dropna(), bins=10, color='blue', alpha=0.7, edgecolor='black')
    axes[1, 0].set_xlabel('Precipitation Mean (kg m-2 s-1)')
    axes[1, 0].set_ylabel('Frequency')
    axes[1, 0].set_title('Precipitation Distribution')
    axes[1, 0].grid(True, alpha=0.3)

# Summary table (text panel). Built line by line so a missing variable is
# reported as "no data" instead of crashing the whole figure cell.
axes[1, 1].axis('off')
summary_lines = [
    "Climate Analysis Summary",
    "━━━━━━━━━━━━━━━━━━━━━━━━━",
    f"Total Files: {len(df)}",
    "",
]

if has_temp:
    summary_lines += [
        "Temperature:",
        f"  Mean: {df['temp_mean'].mean():.2f} K",
        f"  Std:  {df['temp_mean'].std():.2f} K",
        "",
    ]
else:
    summary_lines += ["Temperature: no data", ""]

if has_precip:
    summary_lines += [
        "Precipitation:",
        f"  Mean: {df['precip_mean'].mean():.2e}",
        f"  Total: {df['precip_total'].sum():.2e}",
    ]
else:
    summary_lines += ["Precipitation: no data"]

summary_text = "\n".join(summary_lines) + "\n"

axes[1, 1].text(0.1, 0.5, summary_text, fontfamily='monospace',
                fontsize=10, verticalalignment='center')

plt.tight_layout()
plt.savefig('climate_analysis_summary.png', dpi=300, bbox_inches='tight')
print("✓ Figure saved: climate_analysis_summary.png")
plt.show()

## Step 7: Export Results

Save analysis results for archival

In [None]:
# Create output directory
output_dir = Path('./results')
output_dir.mkdir(exist_ok=True)

# Export DataFrame to CSV
csv_file = output_dir / 'analysis_results.csv'
df.to_csv(csv_file, index=False)
print(f"✓ CSV exported: {csv_file}")


def _summary_stat(column, reducer):
    """Reduce ``df[column]`` with the named Series method, JSON-safely.

    Returns ``None`` when the column is absent (no result file carried that
    variable) or when the reduction is not finite — for example the standard
    deviation of a single file is NaN, and ``json.dump`` would otherwise
    write it as the non-standard token ``NaN``.
    """
    if column not in df.columns:
        return None
    value = float(getattr(df[column], reducer)())
    return value if np.isfinite(value) else None


# Export summary statistics (same keys as before; unavailable values are null)
summary = {
    'total_files': len(df),
    'timestamp': pd.Timestamp.now().isoformat(),
    'temperature': {
        'mean': _summary_stat('temp_mean', 'mean'),
        'std': _summary_stat('temp_mean', 'std'),
        'min': _summary_stat('temp_min', 'min'),
        'max': _summary_stat('temp_max', 'max')
    },
    'precipitation': {
        'mean': _summary_stat('precip_mean', 'mean'),
        'std': _summary_stat('precip_mean', 'std'),
        'total': _summary_stat('precip_total', 'sum')
    }
}

summary_file = output_dir / 'summary.json'
with open(summary_file, 'w') as f:
    json.dump(summary, f, indent=2)

print(f"✓ Summary exported: {summary_file}")
print(f"\nSummary:")
print(json.dumps(summary, indent=2))

## Step 8: Next Steps

What to do next after this analysis

In [None]:
# Print a static closing checklist; this cell performs no computation and
# does not depend on any variable defined earlier in the notebook.
print("""
✓ Analysis Complete!

Next Steps:

1. REVIEW RESULTS
   - Check the figures and CSV output
   - Compare statistics across models
   - Identify patterns in the data

2. EXTEND THE ANALYSIS
   - Add more climate variables (e.g., humidity)
   - Calculate regional anomalies
   - Perform trend analysis

3. SHARE RESULTS
   - Upload figures to S3 for collaboration
   - Generate detailed reports
   - Create visualizations for presentations

4. CLEAN UP AWS RESOURCES
   - Follow cleanup_guide.md
   - Delete S3 bucket and Lambda function
   - Stop incurring charges

5. MOVE TO TIER 3 (OPTIONAL)
   - Production-grade CloudFormation templates
   - Automated workflows
   - Multi-region deployment

See README.md for more information!
""")