# Phase 1: CloudTrail S3 to EC2 Data Pipeline

This notebook orchestrates the download, decompression, and validation of CloudTrail logs from S3.

In [None]:
import sys
from pathlib import Path

# Put the sibling `src` directory on the import path before the project imports
# below run (presumably that is where the `common` and `phase1` packages live).
# The relative path is resolved against the kernel's current working directory,
# and the entry is inserted at position 0 so it is searched before any other
# sys.path entry.
src_path = Path('../src').resolve()
sys.path.insert(0, str(src_path))

from tqdm.notebook import tqdm
import json

from common.config import Phase1Config, S3Config, AuthConfig
from common.logging_config import setup_logging
from phase1.s3_reader import S3CloudTrailReader
from phase1.decompressor import FileDecompressor
from phase1.validator import IntegrityValidator

## Configuration Setup

In [None]:
# Single source of truth for the local data directory. Both the log file and
# Phase1Config's local_base_path are derived from it so the two cannot drift.
DATA_DIR = Path('../data')

# Setup logging. The reports directory is created up front so a fresh checkout
# does not rely on setup_logging() creating it; this is a no-op when the
# directory already exists.
log_file = DATA_DIR / 'reports' / 'phase1.log'
log_file.parent.mkdir(parents=True, exist_ok=True)
logger = setup_logging(log_level="INFO", log_file=log_file.as_posix())

# Configuration consumed by Phase1Config.from_dict().
config_dict = {
    's3_config': {
        'bucket_arn': 'arn:aws:s3:::aws-cloudtrail-logs-123456789112-72b9c5ab-ctrail-cwlog-bucket-name',
        'prefix': 'AWSLogs/123456789112/CloudTrail/ap-south-1',
        # NOTE(review): start_date and end_date differ, so this selects a
        # multi-day window (2025/07/25 through 2025/07/27), not a single day.
        # Whether end_date is inclusive is decided by S3CloudTrailReader —
        # confirm, and narrow the window for a quick test run.
        'start_date': '2025/07/25',
        'end_date': '2025/07/27'
    },
    'auth_config': {
        'method': 'instance_profile'  # Change to 'access_keys' if needed
    },
    'local_base_path': DATA_DIR.as_posix()
}

config = Phase1Config.from_dict(config_dict)
print(f"Configuration loaded for bucket: {config.s3_config.bucket_name}")

## Step 1: List and Download CloudTrail Files

In [None]:
# Build the reader from the S3 and auth sections of the loaded configuration.
s3_reader = S3CloudTrailReader(config.s3_config, config.auth_config)

# Ask the reader for the objects to fetch; later cells iterate over this
# listing and read each entry's 'key', 'size' and 'etag'.
print("Listing CloudTrail objects...")
s3_objects = s3_reader.list_objects()
print(f"Found {len(s3_objects)} files to download")

# Preview at most the first three entries as a quick sanity check.
preview = s3_objects[:3]
for entry in preview:
    key, size = entry['key'], entry['size']
    print(f"  {key} ({size} bytes)")

In [None]:
# Download every listed object into <local_base_path>/raw, using the S3 key as
# the relative path, and record one outcome dict per object:
#   {'key': str, 'downloaded': bool, 'validated': <validate_download result or False>}
raw_path = Path(config.local_base_path) / 'raw'
download_results = []

print("Downloading files...")
for obj in tqdm(s3_objects, desc="Downloading"):
    local_file = raw_path / obj['key']
    # Keys under the configured prefix contain '/' separators, so the target
    # sits in a nested directory. Create it here; this is a no-op when the
    # directory already exists (e.g. if download_file also creates it).
    local_file.parent.mkdir(parents=True, exist_ok=True)

    success = s3_reader.download_file(obj['key'], local_file)
    # Validation is only attempted for files that were actually downloaded;
    # a failed download is recorded as not validated.
    validated = (
        s3_reader.validate_download(obj['key'], local_file, obj['size'], obj['etag'])
        if success else False
    )
    download_results.append({
        'key': obj['key'],
        'downloaded': bool(success),
        'validated': validated
    })

successful_downloads = sum(1 for r in download_results if r['downloaded'])
print(f"Successfully downloaded {successful_downloads}/{len(s3_objects)} files")

# Surface validation failures instead of silently carrying them forward: the
# next cell decompresses every downloaded file regardless of this flag.
failed_validation = [
    r['key'] for r in download_results if r['downloaded'] and not r['validated']
]
if failed_validation:
    print(f"WARNING: {len(failed_validation)} downloaded files failed validation")
    for failed_key in failed_validation[:3]:
        print(f"  {failed_key}")

## Step 2: Decompress Files

In [None]:
# Decompress every successfully downloaded file from <local_base_path>/raw into
# <local_base_path>/processed and record one outcome dict per file:
#   {'key': str, 'decompressed': bool, 'json_valid': <validate_json result or False>}
decompressor = FileDecompressor()
processed_path = Path(config.local_base_path) / 'processed'

decompression_results = []

print("Decompressing files...")
for result in tqdm(download_results, desc="Decompressing"):
    # Only files that were downloaded are processed; the 'validated' flag is
    # not consulted here.
    if not result['downloaded']:
        continue

    gz_file = raw_path / result['key']
    # NOTE(review): str.replace rewrites every '.gz' occurrence in the key, not
    # just the suffix, and CloudTrail log objects are normally named
    # '*.json.gz', which this maps to '*.json.json'. The mapping is kept as-is
    # because IntegrityValidator may derive processed file names the same way —
    # confirm before switching to a suffix-only rename.
    json_file = processed_path / result['key'].replace('.gz', '.json')
    # Make sure the nested output directory exists; this is a no-op when it
    # already does (e.g. if decompress_file also creates it).
    json_file.parent.mkdir(parents=True, exist_ok=True)

    decomp_success = decompressor.decompress_file(gz_file, json_file)
    # JSON validation is only attempted on files that were actually written.
    json_valid = decompressor.validate_json(json_file) if decomp_success else False
    decompression_results.append({
        'key': result['key'],
        'decompressed': bool(decomp_success),
        'json_valid': json_valid
    })

successful_decompressions = sum(1 for r in decompression_results if r['decompressed'])
print(f"Successfully decompressed {successful_decompressions}/{successful_downloads} files")

# Report files that decompressed but did not pass JSON validation; previously
# this flag was collected and never shown.
invalid_json = [
    r['key'] for r in decompression_results if r['decompressed'] and not r['json_valid']
]
if invalid_json:
    print(f"WARNING: {len(invalid_json)} decompressed files failed JSON validation")
    for invalid_key in invalid_json[:3]:
        print(f"  {invalid_key}")

## Step 3: Integrity Validation and Reporting

In [None]:
# Hand the S3 listing and both local directories (raw and processed) to the
# integrity validator.
validator = IntegrityValidator(s3_objects, raw_path, processed_path)

print("Running integrity validation...")
validation_result = validator.validate()

# Write the report under <local_base_path>/reports.
report_path = Path(config.local_base_path, 'reports', 'integrity_report.json')
validator.generate_report(validation_result, report_path)

# Print the headline counters one per line, then the success rate with two
# decimal places.
print("\nValidation Summary:")
for label, attr_name in (
    ("Total S3 files", "total_s3_files"),
    ("Downloaded files", "downloaded_files"),
    ("Processed files", "decompressed_files"),
):
    print(f"  {label}: {getattr(validation_result, attr_name)}")
print(f"  Success rate: {validation_result.success_rate:.2f}%")
print(f"\nReport saved to: {report_path}")

## Summary and Next Steps

In [None]:
# Closing banner. It is printed unconditionally and does not itself check the
# outcome of the earlier steps.
print("Phase 1 Pipeline Complete!")
print(f"\nProcessed data location: {processed_path}")
print("Ready for Phase 2 analysis with DuckDB")

# Recursively collect the processed JSON files and show up to three of them,
# as paths relative to the processed directory.
json_files = [*processed_path.rglob('*.json')]
if len(json_files) > 0:
    print("\nSample processed files:")
    for sample_file in json_files[:3]:
        print(f"  {sample_file.relative_to(processed_path)}")