# Transportation Flow Analysis with AWS

This notebook demonstrates traffic flow analysis using AWS services:
- Generate synthetic traffic data for an urban network
- Upload data to S3
- Trigger Lambda processing
- Query results from DynamoDB
- Visualize traffic patterns and congestion

**Prerequisites:**
- AWS account configured
- S3 bucket created
- Lambda function deployed
- DynamoDB table created

## Setup and Imports

In [None]:
import os
import json
import time
from datetime import datetime, timedelta
from pathlib import Path

import boto3
from boto3.dynamodb.conditions import Key, Attr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from dotenv import load_dotenv

# Read settings via python-dotenv before any configuration is looked up
load_dotenv()

# Plot defaults shared by the figures in this notebook
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Imports complete!")

## Configuration

In [None]:
# AWS settings: each value comes from the environment, with a demo default
AWS_REGION = os.environ.get('AWS_REGION', 'us-east-1')
S3_BUCKET = os.environ.get('S3_BUCKET_NAME', 'transportation-data-demo')
S3_PREFIX = os.environ.get('S3_RAW_PREFIX', 'raw/')
DYNAMODB_TABLE = os.environ.get('DYNAMODB_TABLE_NAME', 'TrafficAnalysis')
LAMBDA_FUNCTION = os.environ.get('LAMBDA_FUNCTION_NAME', 'analyze-traffic-flow')

# Service handles, all bound to the configured region
s3_client, lambda_client = (
    boto3.client(service, region_name=AWS_REGION) for service in ('s3', 'lambda')
)
dynamodb = boto3.resource('dynamodb', region_name=AWS_REGION)

# Echo the effective configuration so a reader can confirm what will be used
print("AWS Configuration:")
for label, value in (
    ("Region", AWS_REGION),
    ("S3 Bucket", S3_BUCKET),
    ("DynamoDB Table", DYNAMODB_TABLE),
    ("Lambda Function", LAMBDA_FUNCTION),
):
    print(f"  {label}: {value}")

## 1. Generate Sample Traffic Data

Create synthetic traffic data for a simple urban network.

In [None]:
def generate_urban_network(n_segments=20, seed=42):
    """
    Generate a simple synthetic urban road network.

    Segment locations are drawn uniformly within +/-0.05 degrees of a
    reference point in San Francisco. Each segment's capacity is its
    per-lane capacity multiplied by that segment's own lane count, so the
    ``lanes`` and ``capacity`` columns always agree.

    Args:
        n_segments: Number of road segments to generate.
        seed: Value passed to ``np.random.seed``. This seeds NumPy's global
            generator, so later ``np.random`` calls are affected as well.

    Returns:
        DataFrame with one row per segment and the columns ``segment_id``,
        ``latitude``, ``longitude``, ``speed_limit``, ``lanes`` and
        ``capacity``. The columns are present even when ``n_segments`` is 0.
    """
    np.random.seed(seed)

    # Reference point: San Francisco
    base_lat = 37.7749
    base_lon = -122.4194

    speed_limits = [35, 45, 55, 65]
    lane_options = [2, 3, 4]
    per_lane_capacities = [2000, 2500, 3000]
    columns = ['segment_id', 'latitude', 'longitude', 'speed_limit', 'lanes', 'capacity']

    segments = []
    for i in range(n_segments):
        latitude = base_lat + np.random.uniform(-0.05, 0.05)
        longitude = base_lon + np.random.uniform(-0.05, 0.05)
        speed_limit = np.random.choice(speed_limits)
        lanes = np.random.choice(lane_options)
        # Reuse the drawn lane count; a second independent draw here would
        # make capacity inconsistent with the reported number of lanes.
        capacity = np.random.choice(per_lane_capacities) * lanes

        segments.append({
            'segment_id': f'SEG-{i+1:03d}',
            'latitude': latitude,
            'longitude': longitude,
            'speed_limit': speed_limit,
            'lanes': lanes,
            'capacity': capacity
        })

    return pd.DataFrame(segments, columns=columns)

# Build the 20-segment demo network used by the cells below
network_df = generate_urban_network(20)
segment_total = len(network_df)
print(f"Generated urban network with {segment_total} segments")
network_df.head()

In [None]:
def generate_traffic_data(network_df, start_time, duration_hours=1, interval_minutes=5):
    """
    Generate synthetic traffic observations for every segment of a network.

    One observation is produced per segment per interval. Volumes are scaled
    up during peak hours (07:00-09:00 and 17:00-19:00); speed, occupancy and
    congestion level are then derived from the volume/capacity (V/C) ratio.
    Random draws come from NumPy's global generator, so results are only
    reproducible if that generator was seeded beforehand.

    Args:
        network_df: DataFrame with at least the columns ``segment_id``,
            ``latitude``, ``longitude``, ``speed_limit``, ``lanes`` and
            ``capacity``. Capacity is treated as an hourly figure.
        start_time: Datetime of the first observation.
        duration_hours: Length of the simulated period in hours.
        interval_minutes: Minutes between consecutive observations.

    Returns:
        DataFrame with the columns ``timestamp`` (ISO 8601 string),
        ``segment_id``, ``latitude``, ``longitude``, ``vehicle_count``,
        ``avg_speed``, ``occupancy`` (a fraction capped at 0.95),
        ``congestion_level`` (0-5), ``speed_limit``, ``capacity`` and
        ``lanes``. The columns are present even when no rows are generated.
    """
    columns = [
        'timestamp', 'segment_id', 'latitude', 'longitude', 'vehicle_count',
        'avg_speed', 'occupancy', 'congestion_level', 'speed_limit',
        'capacity', 'lanes'
    ]

    # 'min' is the supported minute alias; the legacy 'T' alias is deprecated
    # in recent pandas releases.
    timestamps = pd.date_range(
        start=start_time,
        periods=int(duration_hours * 60 / interval_minutes),
        freq=f'{interval_minutes}min'
    )

    # Loop-invariant values: fraction of an hour covered by one interval, and
    # the segment rows as plain dicts (avoids re-running iterrows per timestamp).
    interval_fraction = interval_minutes / 60
    segments = network_df.to_dict('records')

    observations = []
    for timestamp in timestamps:
        hour = timestamp.hour

        # Peak hours: higher traffic
        is_peak = (7 <= hour < 9) or (17 <= hour < 19)
        base_multiplier = 1.5 if is_peak else 0.8

        for segment in segments:
            capacity = segment['capacity']
            speed_limit = segment['speed_limit']

            # Hourly volume scaled down to the vehicles seen in one interval
            base_volume = capacity * base_multiplier * np.random.uniform(0.5, 1.2)
            vehicle_count = int(base_volume * interval_fraction)

            # V/C ratio against the capacity available in one interval
            vc_ratio = vehicle_count / (capacity * interval_fraction)

            # Speed decreases as the V/C ratio increases
            if vc_ratio < 0.5:
                avg_speed = speed_limit * np.random.uniform(0.95, 1.0)
            elif vc_ratio < 0.8:
                avg_speed = speed_limit * np.random.uniform(0.7, 0.9)
            elif vc_ratio < 1.0:
                avg_speed = speed_limit * np.random.uniform(0.5, 0.7)
            else:
                avg_speed = speed_limit * np.random.uniform(0.3, 0.5)

            # Occupancy as a fraction of the road occupied, capped at 0.95
            occupancy = min(0.95, vc_ratio * 0.8)

            # Congestion level (0-5 scale)
            if vc_ratio < 0.5:
                congestion_level = 0
            elif vc_ratio < 0.7:
                congestion_level = 1
            elif vc_ratio < 0.85:
                congestion_level = 2
            elif vc_ratio < 1.0:
                congestion_level = 3
            elif vc_ratio < 1.2:
                congestion_level = 4
            else:
                congestion_level = 5

            observations.append({
                'timestamp': timestamp.isoformat(),
                'segment_id': segment['segment_id'],
                'latitude': segment['latitude'],
                'longitude': segment['longitude'],
                'vehicle_count': vehicle_count,
                'avg_speed': round(avg_speed, 2),
                'occupancy': round(occupancy, 3),
                'congestion_level': congestion_level,
                'speed_limit': speed_limit,
                'capacity': capacity,
                'lanes': segment['lanes']
            })

    return pd.DataFrame(observations, columns=columns)

# Simulate three hours of observations starting at 07:00 today (morning peak)
start_time = datetime.now().replace(hour=7, minute=0, second=0, microsecond=0)
traffic_df = generate_traffic_data(
    network_df,
    start_time,
    duration_hours=3,
    interval_minutes=5,
)

first_observation = traffic_df['timestamp'].min()
last_observation = traffic_df['timestamp'].max()
print(f"Generated {len(traffic_df)} traffic observations")
print(f"Time range: {first_observation} to {last_observation}")
traffic_df.head(10)

### Visualize Generated Data

In [None]:
# Four-panel overview of the generated observations
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_speed, ax_count), (ax_congestion, ax_series) = axes

# Top-left: histogram of average speeds
ax_speed.hist(traffic_df['avg_speed'], bins=30, edgecolor='black')
ax_speed.set(xlabel='Average Speed (mph)', ylabel='Frequency', title='Speed Distribution')

# Top-right: histogram of per-interval vehicle counts
ax_count.hist(traffic_df['vehicle_count'], bins=30, edgecolor='black', color='orange')
ax_count.set(xlabel='Vehicle Count', ylabel='Frequency', title='Vehicle Count Distribution')

# Bottom-left: number of observations at each congestion level
congestion_counts = traffic_df['congestion_level'].value_counts().sort_index()
ax_congestion.bar(congestion_counts.index, congestion_counts.values, color='red', alpha=0.7)
ax_congestion.set(xlabel='Congestion Level', ylabel='Count', title='Congestion Level Distribution')

# Bottom-right: speed trace for one example segment
sample_segment = (
    traffic_df
    .loc[traffic_df['segment_id'] == 'SEG-001']
    .assign(time=lambda frame: pd.to_datetime(frame['timestamp']))
)
ax_series.plot(sample_segment['time'], sample_segment['avg_speed'], marker='o', color='green')
ax_series.set(xlabel='Time', ylabel='Average Speed (mph)', title='Speed Over Time (SEG-001)')
ax_series.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 2. Save Data to CSV and Upload to S3

In [None]:
# Name the export after the current local time so repeated runs do not collide
timestamp_str = datetime.now().strftime('%Y%m%d_%H%M%S')
csv_filename = f'traffic_data_{timestamp_str}.csv'

# Write the observations into ../data, creating the folder on first use
output_dir = Path('../data')
output_dir.mkdir(exist_ok=True)
csv_path = output_dir / csv_filename
traffic_df.to_csv(csv_path, index=False)

size_kb = csv_path.stat().st_size / 1024
print(f"Saved traffic data to: {csv_path}")
print(f"File size: {size_kb:.2f} KB")

In [None]:
# Upload to S3
s3_key = f"{S3_PREFIX}{csv_filename}"

# Record the upload time as a timezone-aware UTC timestamp.
# datetime.utcnow() is deprecated (Python 3.12+) and returns a naive value
# that carries no indication it is in UTC.
uploaded_at = pd.Timestamp.now(tz='UTC').isoformat()

try:
    s3_client.upload_file(
        str(csv_path),
        S3_BUCKET,
        s3_key,
        ExtraArgs={
            'ContentType': 'text/csv',
            # Object metadata values are sent as strings
            'Metadata': {
                'uploaded_at': uploaded_at,
                'records': str(len(traffic_df))
            }
        }
    )
    print(f"✓ Uploaded to S3: s3://{S3_BUCKET}/{s3_key}")
    print("  Lambda will automatically process this file.")
except Exception as e:
    # Best-effort: report the failure and let the rest of the notebook run
    print(f"✗ Upload failed: {str(e)}")

## 3. Wait for Lambda Processing

Lambda is triggered automatically by the S3 upload. The next cell polls S3 for the results file until it appears or the wait times out.

In [None]:
def check_lambda_processing(bucket, key, max_wait_seconds=60, client=None):
    """
    Poll S3 until the analysis results for an uploaded file appear.

    The results key is derived from ``key`` by replacing ``raw/`` with
    ``results/`` and ``.csv`` with ``_analysis.json``.
    NOTE(review): this naming is assumed to match what the Lambda function
    writes - confirm against the deployed function.

    Args:
        bucket: Name of the S3 bucket to check.
        key: Key of the uploaded raw CSV object.
        max_wait_seconds: Number of one-second polling attempts before
            giving up.
        client: Optional object exposing ``head_object(Bucket=..., Key=...)``.
            Defaults to the notebook-level ``s3_client``.

    Returns:
        True if the results object was found, False if it never appeared
        within ``max_wait_seconds`` attempts.
    """
    s3 = client if client is not None else s3_client
    results_key = key.replace('raw/', 'results/').replace('.csv', '_analysis.json')

    print("Waiting for Lambda to process file...")
    print(f"Checking for results at: s3://{bucket}/{results_key}")

    # Error codes that simply mean "the results object is not there yet"
    not_found_codes = {'404', 'NoSuchKey', 'NotFound'}
    reported_errors = set()

    for i in range(max_wait_seconds):
        try:
            s3.head_object(Bucket=bucket, Key=results_key)
        except Exception as err:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt/SystemExit can still stop the wait.
            response = getattr(err, 'response', None)
            code = ''
            if isinstance(response, dict):
                code = str(response.get('Error', {}).get('Code', ''))

            if code in not_found_codes:
                if i % 5 == 0:
                    print(f"  Waiting... ({i+1}s)", end='\r')
            else:
                # Anything other than "not found" (permissions, networking,
                # wrong bucket...) is shown once instead of being hidden.
                message = f"{type(err).__name__}: {err}"
                if message not in reported_errors:
                    reported_errors.add(message)
                    print(f"\n⚠ Unexpected error while checking for results: {message}")
            time.sleep(1)
        else:
            print(f"\n✓ Processing complete! Results found after {i+1} seconds.")
            return True

    print(f"\n⚠ Processing did not complete within {max_wait_seconds} seconds.")
    print("  Check CloudWatch logs for errors.")
    return False

# Block (for at most 60 polling attempts) until the results file shows up
processing_complete = check_lambda_processing(
    bucket=S3_BUCKET,
    key=s3_key,
    max_wait_seconds=60,
)

## 4. Query Results from DynamoDB

In [None]:
def query_dynamodb_results(table_name, limit=1000):
    """
    Scan a DynamoDB table and return the collected items as a DataFrame.

    Scan pages are requested until either no ``LastEvaluatedKey`` is
    returned or ``limit`` items have been gathered. Any exception raised
    while scanning is printed and an empty DataFrame is returned instead.

    Args:
        table_name: Name of the DynamoDB table to scan.
        limit: Maximum number of items to collect.

    Returns:
        DataFrame built from the scanned items (empty on error).
    """
    table = dynamodb.Table(table_name)
    collected = []
    scan_kwargs = {'Limit': limit}

    try:
        while True:
            page = table.scan(**scan_kwargs)
            collected.extend(page.get('Items', []))

            # Stop when the table is exhausted or enough items were gathered
            if 'LastEvaluatedKey' not in page or len(collected) >= limit:
                break

            # Continue after the last key seen, asking only for the remainder
            scan_kwargs = {
                'Limit': limit - len(collected),
                'ExclusiveStartKey': page['LastEvaluatedKey'],
            }

        print(f"Retrieved {len(collected)} records from DynamoDB")
        return pd.DataFrame(collected)

    except Exception as e:
        print(f"Error querying DynamoDB: {str(e)}")
        return pd.DataFrame()

# Query results
results_df = query_dynamodb_results(DYNAMODB_TABLE)

if not results_df.empty:
    print(f"\nResults summary:")
    print(f"  Total records: {len(results_df)}")
    print(f"  Unique segments: {results_df['segment_id'].nunique()}")
    print(f"  Time range: {results_df['timestamp_iso'].min()} to {results_df['timestamp_iso'].max()}")

# Jupyter only renders the last *top-level* expression of a cell; a bare
# expression inside the `if` body above would be evaluated and discarded.
results_df.head()

## 5. Analyze Traffic Metrics

In [None]:
if not results_df.empty:
    # Convert numeric columns in place; later cells rely on these columns
    # being numeric. (Presumably the DynamoDB values do not arrive as plain
    # floats - TODO confirm.)
    numeric_cols = ['avg_speed', 'vc_ratio', 'travel_time_index', 'reliability_score']
    for col in numeric_cols:
        if col in results_df.columns:
            results_df[col] = pd.to_numeric(results_df[col])
    
    # Summary statistics
    print("="*70)
    print("TRAFFIC ANALYSIS SUMMARY")
    print("="*70)
    print(f"\nOverall Metrics:")
    print(f"  Average Speed: {results_df['avg_speed'].mean():.2f} mph")
    print(f"  Average V/C Ratio: {results_df['vc_ratio'].mean():.3f}")
    print(f"  Average Travel Time Index: {results_df['travel_time_index'].mean():.3f}")
    print(f"  Congestion Rate: {results_df['is_congested'].sum() / len(results_df) * 100:.1f}%")
    
    # Level of Service distribution
    print(f"\nLevel of Service Distribution:")
    los_dist = results_df['los'].value_counts().sort_index()
    for los, count in los_dist.items():
        pct = count / len(results_df) * 100
        print(f"  LOS {los}: {count} ({pct:.1f}%)")
    
    # Top congested segments: keep only each segment's highest-V/C record
    # before taking the top five, so the list names five distinct segments
    # rather than possibly repeating one segment's observations.
    print(f"\nTop 5 Congested Segments:")
    top_congested = (
        results_df
        .sort_values('vc_ratio', ascending=False)
        .drop_duplicates(subset='segment_id')
        .head(5)[['segment_id', 'vc_ratio', 'los', 'avg_speed']]
    )
    for _, row in top_congested.iterrows():
        print(f"  {row['segment_id']}: V/C={row['vc_ratio']:.3f}, LOS={row['los']}, Speed={row['avg_speed']:.1f} mph")
else:
    print("No results available for analysis.")

## 6. Visualize Results

In [None]:
if results_df.empty:
    print("No results available for visualization.")
else:
    # Six-panel summary figure of the processed results
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    (ax_los, ax_vc, ax_scatter), (ax_tti, ax_segments, ax_reliability) = axes

    # 1. LOS Distribution
    # NOTE(review): colours are assigned by bar position, not by LOS value.
    los_counts = results_df['los'].value_counts().sort_index()
    palette = ['green', 'lightgreen', 'yellow', 'orange', 'red', 'darkred']
    ax_los.bar(los_counts.index, los_counts.values, color=palette[:len(los_counts)])
    ax_los.set(xlabel='Level of Service', ylabel='Count', title='Level of Service Distribution')

    # 2. V/C Ratio Distribution, with the 0.8 threshold marked
    ax_vc.hist(results_df['vc_ratio'], bins=30, edgecolor='black', color='skyblue')
    ax_vc.axvline(0.8, color='red', linestyle='--', label='Congestion Threshold')
    ax_vc.set(xlabel='V/C Ratio', ylabel='Frequency', title='Volume/Capacity Ratio Distribution')
    ax_vc.legend()

    # 3. Speed vs V/C Ratio, coloured by V/C ratio
    ax_scatter.scatter(
        results_df['vc_ratio'],
        results_df['avg_speed'],
        c=results_df['vc_ratio'],
        cmap='RdYlGn_r',
        alpha=0.6,
    )
    ax_scatter.set(xlabel='V/C Ratio', ylabel='Average Speed (mph)', title='Speed vs Congestion')

    # 4. Travel Time Index, with 1.0 marked as free flow
    ax_tti.hist(results_df['travel_time_index'], bins=30, edgecolor='black', color='coral')
    ax_tti.axvline(1.0, color='green', linestyle='--', label='Free Flow')
    ax_tti.set(xlabel='Travel Time Index', ylabel='Frequency', title='Travel Time Index Distribution')
    ax_tti.legend()

    # 5. Congestion by Segment: ten highest mean congestion flags
    congestion_by_segment = results_df.groupby('segment_id')['is_congested'].mean()
    segment_congestion = congestion_by_segment.sort_values(ascending=False).head(10)
    bar_positions = range(len(segment_congestion))
    ax_segments.barh(bar_positions, segment_congestion.values, color='red', alpha=0.7)
    ax_segments.set_yticks(bar_positions)
    ax_segments.set_yticklabels(segment_congestion.index)
    ax_segments.set(xlabel='Congestion Rate', title='Top 10 Congested Segments')

    # 6. Reliability Score
    ax_reliability.hist(
        results_df['reliability_score'],
        bins=30,
        edgecolor='black',
        color='purple',
        alpha=0.7,
    )
    ax_reliability.set(xlabel='Reliability Score', ylabel='Frequency', title='Travel Time Reliability')

    plt.tight_layout()
    plt.savefig('../data/traffic_analysis_summary.png', dpi=300, bbox_inches='tight')
    print("Figure saved to: ../data/traffic_analysis_summary.png")
    plt.show()

## 7. Network Visualization

In [None]:
if not results_df.empty:
    # Number of nearest segments each node is linked to
    EDGE_NEIGHBORS = 3

    # Create network graph
    G = nx.Graph()

    # One node per segment: position from the segment's first record,
    # colour value from its mean V/C ratio. A single groupby pass replaces
    # re-filtering the whole frame for every segment.
    for segment_id, segment_rows in results_df.groupby('segment_id', sort=False):
        first_row = segment_rows.iloc[0]
        G.add_node(segment_id,
                  pos=(float(first_row['longitude']), float(first_row['latitude'])),
                  avg_vc=float(segment_rows['vc_ratio'].mean()))

    # Add edges (simplified): link every node to its EDGE_NEIGHBORS nearest
    # nodes. Distance is plain Euclidean distance in degrees, which is an
    # approximation but adequate for ordering nearby points.
    node_ids = list(G.nodes())
    coords = np.array([G.nodes[node_id]['pos'] for node_id in node_ids], dtype=float).reshape(-1, 2)
    for i, node_id in enumerate(node_ids):
        distances = np.hypot(coords[:, 0] - coords[i, 0], coords[:, 1] - coords[i, 1])
        distances[i] = np.inf  # never link a node to itself
        for j in np.argsort(distances)[:EDGE_NEIGHBORS]:
            if np.isfinite(distances[j]):
                G.add_edge(node_id, node_ids[j])

    # Create visualization
    plt.figure(figsize=(14, 10))

    # Get positions and colors
    pos = nx.get_node_attributes(G, 'pos')
    vc_ratios = [G.nodes[node]['avg_vc'] for node in G.nodes()]

    # Draw network
    nx.draw_networkx_edges(G, pos, alpha=0.3, width=2)
    node_collection = nx.draw_networkx_nodes(G, pos, node_color=vc_ratios,
                                             node_size=500, cmap='RdYlGn_r',
                                             vmin=0, vmax=1.5)
    nx.draw_networkx_labels(G, pos, font_size=8)

    # Add colorbar
    plt.colorbar(node_collection, label='Average V/C Ratio')
    plt.title('Urban Traffic Network - Congestion Heatmap', fontsize=16)
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.axis('on')
    plt.grid(True, alpha=0.3)

    plt.savefig('../data/network_heatmap.png', dpi=300, bbox_inches='tight')
    print("Network heatmap saved to: ../data/network_heatmap.png")
    plt.show()
else:
    print("No results available for network visualization.")

## 8. Time Series Analysis

In [None]:
if results_df.empty or 'timestamp_iso' not in results_df.columns:
    print("Insufficient data for time series analysis.")
else:
    # Plot the first five segment ids found in the results
    sample_segments = results_df['segment_id'].unique()[:5]
    panel_count = len(sample_segments)

    # squeeze=False always yields a 2-D array, so one panel needs no special case
    fig, axes = plt.subplots(panel_count, 1, figsize=(14, 3*panel_count), squeeze=False)
    axes = axes.ravel()

    for ax_speed, segment_id in zip(axes, sample_segments):
        # This segment's records in chronological order
        segment_data = (
            results_df[results_df['segment_id'] == segment_id]
            .assign(time=lambda frame: pd.to_datetime(frame['timestamp_iso']))
            .sort_values('time')
        )

        # Speed on the left axis, V/C ratio on a twin right axis
        ax_vc = ax_speed.twinx()
        speed_lines = ax_speed.plot(segment_data['time'], segment_data['avg_speed'],
                                    'b-o', label='Speed', markersize=4)
        vc_lines = ax_vc.plot(segment_data['time'], segment_data['vc_ratio'],
                              'r-s', label='V/C Ratio', markersize=4)

        ax_speed.set_xlabel('Time')
        ax_speed.set_ylabel('Speed (mph)', color='b')
        ax_vc.set_ylabel('V/C Ratio', color='r')
        ax_speed.set_title(f'Traffic Metrics Over Time - {segment_id}')
        ax_speed.tick_params(axis='y', labelcolor='b')
        ax_vc.tick_params(axis='y', labelcolor='r')
        ax_speed.grid(True, alpha=0.3)

        # One legend covering the lines from both axes
        handles = speed_lines + vc_lines
        ax_speed.legend(handles, [handle.get_label() for handle in handles], loc='upper right')

    plt.tight_layout()
    plt.savefig('../data/time_series_analysis.png', dpi=300, bbox_inches='tight')
    print("Time series plot saved to: ../data/time_series_analysis.png")
    plt.show()

## 9. Export Results

In [None]:
if not results_df.empty:
    # Export to CSV
    output_csv = f'../data/analysis_results_{timestamp_str}.csv'
    results_df.to_csv(output_csv, index=False)
    print(f"Results exported to: {output_csv}")

    # Keep each segment's highest-V/C record, then take the top five, so the
    # report lists five distinct segments instead of possibly repeating one.
    top_congested = (
        results_df
        .sort_values('vc_ratio', ascending=False)
        .drop_duplicates(subset='segment_id')
        .head(5)
    )

    # Create summary report
    summary = {
        'analysis_timestamp': datetime.now().isoformat(),
        'total_records': len(results_df),
        'unique_segments': results_df['segment_id'].nunique(),
        'metrics': {
            'avg_speed': float(results_df['avg_speed'].mean()),
            'avg_vc_ratio': float(results_df['vc_ratio'].mean()),
            'avg_tti': float(results_df['travel_time_index'].mean()),
            'congestion_rate': float(results_df['is_congested'].sum() / len(results_df))
        },
        'los_distribution': results_df['los'].value_counts().to_dict(),
        'top_congested_segments': top_congested[['segment_id', 'vc_ratio']].to_dict('records')
    }

    output_json = f'../data/analysis_summary_{timestamp_str}.json'
    # Explicit encoding so the report does not depend on the platform default
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)
    print(f"Summary report exported to: {output_json}")
else:
    print("No results to export.")

## Summary

This notebook demonstrated:

1. ✓ Generated synthetic traffic data for an urban network
2. ✓ Uploaded data to S3
3. ✓ Lambda automatically processed the data
4. ✓ Queried results from DynamoDB
5. ✓ Analyzed traffic metrics (LOS, V/C ratio, TTI)
6. ✓ Visualized congestion patterns and network
7. ✓ Exported results for further analysis

### Next Steps

- Analyze multiple time periods to identify patterns
- Implement predictive congestion models
- Integrate with real traffic data sources
- Create real-time monitoring dashboard
- Move to Tier 3 for production deployment