# Genomic Variant Analysis with AWS Lambda and DynamoDB

**Duration:** 60-90 minutes

This notebook demonstrates:
1. Uploading BAM files to S3
2. Triggering Lambda functions for variant calling
3. Querying results from DynamoDB
4. Visualizing variant statistics

## Prerequisites

- AWS credentials configured in `.env`
- All AWS resources created (see `../setup_guide.md`)
- Python dependencies installed (`pip install -r ../requirements.txt`)

In [None]:
# Setup
%matplotlib inline

import os
import sys
import json
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv
from typing import Dict, List

# Read settings from the .env file before any os.getenv() lookups below
load_dotenv()

# Plot defaults: seaborn "whitegrid" theme, 12x6 figures unless a cell overrides it
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))

# AWS configuration
# Region, table and function names fall back to defaults; the two bucket
# names have no default and are None when the environment does not set them.
AWS_REGION = os.getenv('AWS_REGION', 'us-east-1')
BUCKET_INPUT = os.getenv('BUCKET_INPUT')
BUCKET_RESULTS = os.getenv('BUCKET_RESULTS')
TABLE_NAME = os.getenv('TABLE_NAME', 'variant-metadata')
LAMBDA_FUNCTION = os.getenv('LAMBDA_FUNCTION', 'variant-calling')

print(f"AWS Region: {AWS_REGION}")
print(f"Input Bucket: {BUCKET_INPUT}")
print(f"Results Bucket: {BUCKET_RESULTS}")
print(f"DynamoDB Table: {TABLE_NAME}")
print(f"Lambda Function: {LAMBDA_FUNCTION}")

# Names of required settings that are unset or empty. Later cells pass the
# bucket names straight to S3 calls, so flag the problem here instead of
# letting it surface as a parameter error further down the notebook.
MISSING_CONFIG = [
    name
    for name, value in (
        ('BUCKET_INPUT', BUCKET_INPUT),
        ('BUCKET_RESULTS', BUCKET_RESULTS),
    )
    if not value
]
if MISSING_CONFIG:
    print(f"WARNING: missing required setting(s): {', '.join(MISSING_CONFIG)}. "
          "Set them in .env and re-run this cell.")

## Step 1: Upload Sample Data to S3

First, we'll check what is already in the input S3 bucket. If it is empty, upload the BAM files and reference data using the next cell.

In [None]:
# S3 client for the configured region
s3_client = boto3.client('s3', region_name=AWS_REGION)

# Show at most 100 objects from the input bucket, one per line with size in MB
print("Current S3 bucket contents:")
response = s3_client.list_objects_v2(Bucket=BUCKET_INPUT, MaxKeys=100)

listed_objects = response.get('Contents')
if listed_objects is None:
    # S3 omits the 'Contents' key entirely when nothing matches
    print("  Bucket is empty. Run upload_to_s3.py to add sample data.")
else:
    for entry in listed_objects:
        size_mb = entry['Size'] / (1024 * 1024)
        print(f"  {entry['Key']:<50} {size_mb:>10.2f} MB")

### Upload sample data if needed

In [None]:
# If bucket is empty, run upload script
# Uncomment the two lines below to run the upload script

# import subprocess
# subprocess.run(['python', '../scripts/upload_to_s3.py', '--synthetic'])

## Step 2: Invoke Lambda Function

Now we'll trigger the Lambda function to call variants on a specific genomic region.

In [None]:
# Create Lambda client
lambda_client = boto3.client('lambda', region_name=AWS_REGION)

# Define the region to analyze
genomic_region = 'chr20:1000000-1010000'
sample_id = 'NA12878'

# Create the event payload
event = {
    'bucket': BUCKET_INPUT,
    'key': 'samples/NA12878.chr20.bam',  # Adjust if your file has different name
    'region': genomic_region,
    'sample_id': sample_id
}

print(f"Invoking Lambda function: {LAMBDA_FUNCTION}")
print(f"Event payload: {json.dumps(event, indent=2)}")
print()
print("Calling variant-calling Lambda function...")


def _lambda_body(reply):
    """Return the ``body`` field of a Lambda reply as a dict.

    The handler's response shape is not visible from this notebook, so both a
    dict body and a JSON-encoded string body are accepted. A string that is
    not valid JSON is returned as ``{'error': <text>}`` so it is still shown
    to the user; any other shape yields an empty dict.
    """
    body = reply.get('body', {}) if isinstance(reply, dict) else {}
    if isinstance(body, str):
        try:
            body = json.loads(body)
        except ValueError:
            return {'error': body}
    return body if isinstance(body, dict) else {}


# Invoke Lambda
try:
    response = lambda_client.invoke(
        FunctionName=LAMBDA_FUNCTION,
        InvocationType='RequestResponse',  # Wait for response
        Payload=json.dumps(event)
    )

    # Parse response
    payload = json.loads(response['Payload'].read())

    print(f"Response: {json.dumps(payload, indent=2)}")

    if response.get('FunctionError'):
        # The function itself raised. The invoke call still succeeds, but the
        # payload is an error document rather than the handler's return
        # value, so statusCode/body are not present.
        if isinstance(payload, dict):
            message = payload.get('errorMessage', 'Unknown')
        else:
            message = payload
        print(f"\n✗ Lambda execution failed")
        print(f"  Error: {message}")
    elif isinstance(payload, dict) and payload.get('statusCode') == 200:
        # Extract results
        result = _lambda_body(payload)
        print(f"\n✓ Lambda execution successful!")
        print(f"  Variants called: {result.get('variants', 0)}")
        print(f"  VCF file: {result.get('vcf_file', 'N/A')}")
    else:
        print(f"\n✗ Lambda execution failed")
        print(f"  Error: {_lambda_body(payload).get('error', 'Unknown')}")

except Exception as e:
    # Covers client-side failures (credentials, missing function, bad payload)
    print(f"✗ Error invoking Lambda: {str(e)}")

## Step 3: Query DynamoDB for Variant Metadata

After the Lambda function runs, variant metadata is stored in DynamoDB. Let's retrieve the records for our sample.

In [None]:
# Create DynamoDB resource
dynamodb = boto3.resource('dynamodb', region_name=AWS_REGION)
table = dynamodb.Table(TABLE_NAME)

print(f"Querying DynamoDB table: {TABLE_NAME}")
print(f"Sample: {sample_id}")
print()

# Columns that later cells average, bin and plot. The boto3 resource API
# returns DynamoDB numbers as decimal.Decimal, which pandas keeps as
# object dtype, so they are converted to floats after loading.
numeric_columns = ['quality', 'depth', 'allele_freq']
preview_columns = ['chromosome', 'position', 'ref', 'alt', 'quality', 'depth']

# Query DynamoDB
try:
    scan_kwargs = {
        'FilterExpression': 'sample_id = :sample',
        'ExpressionAttributeValues': {':sample': sample_id},
    }

    # A scan returns one page at a time; keep going until DynamoDB stops
    # handing back a LastEvaluatedKey.
    items = []
    while True:
        response = table.scan(**scan_kwargs)
        items.extend(response.get('Items', []))
        last_key = response.get('LastEvaluatedKey')
        if not last_key:
            break
        scan_kwargs['ExclusiveStartKey'] = last_key

    print(f"Retrieved {len(items)} variants from DynamoDB")

    # Convert to DataFrame
    if items:
        df_variants = pd.DataFrame(items)

        # errors='coerce' turns values that are not numbers into NaN rather
        # than aborting the whole load.
        for column in numeric_columns:
            if column in df_variants.columns:
                df_variants[column] = pd.to_numeric(df_variants[column], errors='coerce')

        print(f"\nVariants DataFrame shape: {df_variants.shape}")
        print(f"\nFirst few variants:")
        # Preview only the columns that exist: a missing attribute must not
        # raise here, because the handler below would discard the data and
        # report it as a query failure.
        available = [col for col in preview_columns if col in df_variants.columns]
        print(df_variants[available].head())
    else:
        print("No variants found in DynamoDB. Check Lambda logs for errors.")
        df_variants = pd.DataFrame()

except Exception as e:
    print(f"✗ Error querying DynamoDB: {str(e)}")
    df_variants = pd.DataFrame()

## Step 4: Variant Summary Statistics

In [None]:
if df_variants.empty:
    print("No data available for statistics")
else:
    rule = "=" * 60
    print(rule)
    print("Variant Summary Statistics")
    print(rule)

    # Headline counts
    print(f"\nTotal variants: {len(df_variants)}")
    print(f"Unique chromosomes: {df_variants['chromosome'].nunique()}")

    # Label each row: equal-length ref/alt alleles count as SNP, anything else as INDEL.
    # The new column is stored on df_variants itself.
    df_variants['var_type'] = [
        'SNP' if len(ref_allele) == len(alt_allele) else 'INDEL'
        for ref_allele, alt_allele in zip(df_variants['ref'], df_variants['alt'])
    ]

    print(f"\nVariant types:")
    print(df_variants['var_type'].value_counts())

    # Quality: mean / median / min / max
    quality = df_variants['quality']
    print(f"\nQuality metrics:")
    print(f"  Mean: {quality.mean():.2f}")
    print(f"  Median: {quality.median():.2f}")
    print(f"  Min: {quality.min():.2f}")
    print(f"  Max: {quality.max():.2f}")

    # Depth: mean / median / min / max
    depth = df_variants['depth']
    print(f"\nDepth metrics:")
    print(f"  Mean: {depth.mean():.2f}")
    print(f"  Median: {depth.median():.2f}")
    print(f"  Min: {depth.min():.2f}")
    print(f"  Max: {depth.max():.2f}")

    # Allele frequency: mean / median, four decimals
    allele_freq = df_variants['allele_freq']
    print(f"\nAllele frequency metrics:")
    print(f"  Mean: {allele_freq.mean():.4f}")
    print(f"  Median: {allele_freq.median():.4f}")

    print(rule)

## Step 5: Visualizations

In [None]:
if df_variants.empty:
    print("No data available for visualization")
else:
    # 2x2 grid: quality histogram, depth histogram, type counts, quality-vs-AF scatter
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    (ax_quality, ax_depth), (ax_types, ax_scatter) = axes

    # Top-left: distribution of quality scores
    ax_quality.hist(df_variants['quality'], bins=20, edgecolor='black', alpha=0.7)
    ax_quality.set(
        xlabel='Quality Score',
        ylabel='Number of Variants',
        title='Quality Score Distribution',
    )
    ax_quality.grid(True, alpha=0.3)

    # Top-right: distribution of read depth
    ax_depth.hist(df_variants['depth'], bins=20, edgecolor='black', alpha=0.7, color='green')
    ax_depth.set(
        xlabel='Depth (Read Coverage)',
        ylabel='Number of Variants',
        title='Depth Distribution',
    )
    ax_depth.grid(True, alpha=0.3)

    # Bottom-left: count per variant type (reads the 'var_type' column)
    var_type_counts = df_variants['var_type'].value_counts()
    colors = ['#FF6B6B', '#4ECDC4']
    ax_types.bar(
        var_type_counts.index,
        var_type_counts.values,
        color=colors,
        edgecolor='black',
    )
    ax_types.set(xlabel='Variant Type', ylabel='Count', title='Variant Type Distribution')
    ax_types.grid(True, alpha=0.3, axis='y')

    # Bottom-right: quality against allele frequency, point colour encodes depth
    scatter = ax_scatter.scatter(
        df_variants['allele_freq'],
        df_variants['quality'],
        c=df_variants['depth'],
        cmap='viridis',
        s=100,
        alpha=0.6,
    )
    ax_scatter.set(
        xlabel='Allele Frequency',
        ylabel='Quality Score',
        title='Quality vs Allele Frequency (colored by depth)',
    )
    cbar = fig.colorbar(scatter, ax=ax_scatter)
    cbar.set_label('Depth')
    ax_scatter.grid(True, alpha=0.3)

    # Write the figure one directory above the notebook, then render it inline
    fig.tight_layout()
    fig.savefig('../results_visualization.png', dpi=100, bbox_inches='tight')
    print("✓ Visualization saved to results_visualization.png")
    plt.show()

## Step 6: Download VCF Results from S3

In [None]:
# List VCF files in results bucket
print(f"Listing VCF files in s3://{BUCKET_RESULTS}/results/")
print()

# Number of leading VCF lines echoed after the download
vcf_preview_lines = 20

try:
    # A single list_objects_v2 call returns only one page of keys; the
    # paginator follows the continuation tokens so VCFs beyond the first
    # page are not silently missed.
    vcf_files = []
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=BUCKET_RESULTS, Prefix='results/'):
        # 'Contents' is absent from a page that matched nothing
        for obj in page.get('Contents', []):
            if obj['Key'].endswith('.vcf'):
                vcf_files.append(obj['Key'])
                size_kb = obj['Size'] / 1024
                print(f"  {obj['Key']:<60} {size_kb:>10.2f} KB")

    if not vcf_files:
        print("  No VCF files found. Check Lambda logs for errors.")
    else:
        print(f"\n✓ Found {len(vcf_files)} VCF file(s)")

        # Download the first VCF
        vcf_key = vcf_files[0]
        local_vcf_path = f"../downloads/{Path(vcf_key).name}"

        print(f"\nDownloading: {vcf_key}")
        os.makedirs('../downloads', exist_ok=True)
        s3_client.download_file(BUCKET_RESULTS, vcf_key, local_vcf_path)
        print(f"✓ Downloaded to: {local_vcf_path}")

        # Display VCF content: stream the file so only the preview lines are
        # held in memory, while still counting the rest for the summary line.
        print(f"\nVCF file contents:")
        remaining = 0
        with open(local_vcf_path, 'r') as f:
            for line_number, line in enumerate(f):
                if line_number < vcf_preview_lines:
                    print(line.rstrip())
                else:
                    remaining += 1
        if remaining:
            print(f"... ({remaining} more lines)")

except Exception as e:
    # Listing, download and file-read failures are all reported here
    print(f"✗ Error: {str(e)}")

## Step 7: Export Results to CSV

In [None]:
if not df_variants.empty:
    # Export to CSV (written one directory above the notebook, without the index)
    output_csv = f"../variant_results_{sample_id}.csv"
    df_variants.to_csv(output_csv, index=False)
    print(f"✓ Exported {len(df_variants)} variants to {output_csv}")

    # Show first few rows
    print(f"\nFirst 5 variants:")
    print(df_variants[['chromosome', 'position', 'ref', 'alt', 'quality', 'depth', 'allele_freq']].head())

    # Show data types
    print(f"\nDataFrame info:")
    # info() writes its report to stdout itself and returns None, so it must
    # not be wrapped in print() -- that appended a stray "None" line.
    df_variants.info()
else:
    print("No data to export")

## Step 8: Analyze Quality and Coverage

In [None]:
if not df_variants.empty:
    # Create quality categories.
    # The top bin is open-ended and the lowest edge is inclusive: with a
    # closed 0-100 range, any score above 100 (or exactly 0) fell outside
    # every bin, became NaN and vanished from the counts below.
    df_variants['quality_category'] = pd.cut(
        df_variants['quality'],
        bins=[0, 20, 40, 60, np.inf],
        labels=['Low', 'Medium', 'High', 'Very High'],
        include_lowest=True
    )

    print("Quality categories:")
    print(df_variants['quality_category'].value_counts().sort_index())
    print()

    # Create depth categories (same treatment: depth above 1000 or exactly 0
    # is now counted instead of dropped)
    df_variants['depth_category'] = pd.cut(
        df_variants['depth'],
        bins=[0, 10, 20, 50, np.inf],
        labels=['Low', 'Medium', 'High', 'Very High'],
        include_lowest=True
    )

    print("Depth categories:")
    print(df_variants['depth_category'].value_counts().sort_index())
else:
    print("No data available for analysis")

## Step 9: Performance Summary

In [None]:
# Closing banner
print("\n" + "=" * 60)
print("Analysis Complete!")
print("=" * 60)

if df_variants.empty:
    # Nothing was loaded: point at the usual places to look
    print(f"\nNo variants found. Check:")
    print(f"  1. Lambda logs in CloudWatch")
    print(f"  2. S3 bucket contains BAM files")
    print(f"  3. DynamoDB table exists and is accessible")
else:
    # Recap of the run; the SNP/indel counts read the 'var_type' column
    variant_types = df_variants['var_type']
    print(f"\nResults Summary:")
    print(f"  Sample: {sample_id}")
    print(f"  Region: {genomic_region}")
    print(f"  Variants Found: {len(df_variants)}")
    print(f"  SNPs: {variant_types.eq('SNP').sum()}")
    print(f"  Indels: {variant_types.eq('INDEL').sum()}")
    print(f"  Mean Quality: {df_variants['quality'].mean():.2f}")
    print(f"  Mean Depth: {df_variants['depth'].mean():.2f}")

    # Paths used by the export, plotting and download cells
    print(f"\nOutputs:")
    print(f"  - CSV: ../variant_results_{sample_id}.csv")
    print(f"  - Visualization: ../results_visualization.png")
    print(f"  - VCF: ../downloads/")

# Shown regardless of whether any variants were found
print(f"\nNext Steps:")
print(f"  1. Review results in CSV and visualizations")
print(f"  2. Download VCF files for further analysis")
print(f"  3. Run cleanup_guide.md to delete AWS resources")

print("\n" + "=" * 60)