# Retail Data Format Migration Demo

This notebook demonstrates the practical implementation of data format migration and optimization techniques discussed in Chapter 3 of "Modern Data Engineering on AWS".

## Setup
First, let's import required libraries and initialize our environment.

In [None]:
import os
import sys
sys.path.append('../src')

from utils import (
    AWSManager, 
    DataGenerator, 
    PerformanceAnalyzer,
    load_config,
    setup_spark_session
)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set plotting style
# The bare 'seaborn' style sheet was deprecated in Matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and removed in 3.8, where plt.style.use('seaborn') raises
# OSError. Pick whichever name this Matplotlib ships; if neither is present,
# keep the default style rather than failing the setup cell.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette("husl")

## 1. Initialize Environment
Load configuration and set up AWS resources

In [None]:
# Load configuration
# load_config comes from the project's utils module; later cells read
# config['aws']['source_bucket'] and config['aws']['target_bucket'] from it.
config = load_config()

# Initialize AWS Manager
aws_manager = AWSManager()

# Create required buckets
# NOTE(review): presumably creates the source/target buckets named in config —
# confirm against AWSManager.create_buckets.
aws_manager.create_buckets(config)

# Initialize Spark session
# The session is reused by the migration and performance cells and is stopped
# in the final clean-up cell.
spark = setup_spark_session()

## 2. Generate Sample Data
Create realistic retail data for our migration demo

In [None]:
# Initialize data generator
data_generator = DataGenerator(config)

# Generate transaction data
# Later cells use this as a pandas DataFrame (.to_csv, spark.createDataFrame)
# and reference its 'category' and 'amount' columns.
# NOTE(review): row count and seeding are controlled inside DataGenerator — confirm
# the output is reproducible across runs.
transactions_df = data_generator.generate_transactions()
print("\nTransaction Data Sample:")
display(transactions_df.head())

# Generate customer data
# Later cells use this as a pandas DataFrame (.to_json, .to_parquet) and
# partition it by its 'country' column.
customers_df = data_generator.generate_customers()
print("\nCustomer Data Sample:")
display(customers_df.head())

## 3. Initial Data Storage
Store the data in its original formats: transactions as CSV and customers as JSON Lines (one JSON record per line)

In [None]:
# Transactions -> a single CSV object in the source bucket (index column omitted)
transactions_path = f"s3://{config['aws']['source_bucket']}/transactions/data.csv"
transactions_df.to_csv(
    path_or_buf=transactions_path,
    index=False,
)

# Customers -> newline-delimited JSON in the source bucket, one record per line
customers_path = f"s3://{config['aws']['source_bucket']}/customers/data.json"
customers_df.to_json(
    path_or_buf=customers_path,
    orient='records',
    lines=True,
)

print("Data stored in original formats")

## 4. Analyze Initial Storage Metrics
Measure storage usage and access patterns

In [None]:
# The analyzer is bound to the active Spark session and reused by later cells
performance_analyzer = PerformanceAnalyzer(spark)

# Collect storage metrics for each raw dataset prefix in the source bucket,
# keyed by dataset name (transactions first, then customers)
initial_metrics = {
    dataset: performance_analyzer.analyze_storage_metrics(
        config['aws']['source_bucket'], prefix
    )
    for dataset, prefix in (
        ('transactions', 'transactions/'),
        ('customers', 'customers/'),
    )
}

# Show one row per dataset, one column per metric
pd.DataFrame(initial_metrics).transpose()

## 5. Perform Format Migration
Migrate the data to optimized formats: transactions to Delta (partitioned by category) and customers to Parquet (partitioned by country)

In [None]:
# Transactions: lift the pandas frame into Spark and target a Delta table
# location in the target bucket
spark_df = spark.createDataFrame(transactions_df)
delta_path = f"s3://{config['aws']['target_bucket']}/transactions_delta"

# Overwrite any existing table at delta_path, one partition directory per category
(
    spark_df.write
    .format("delta")
    .partitionBy("category")
    .mode("overwrite")
    .save(delta_path)
)

# Customers: written straight from pandas as snappy-compressed Parquet,
# partitioned by country
parquet_path = f"s3://{config['aws']['target_bucket']}/customers_parquet"
customers_df.to_parquet(parquet_path, partition_cols=['country'], compression='snappy')

## 6. Compare Performance
Measure and compare performance between original and optimized formats

In [None]:
# Measure storage after migration
optimized_metrics = {
    'transactions_delta': performance_analyzer.analyze_storage_metrics(
        config['aws']['target_bucket'], 'transactions_delta/'
    ),
    'customers_parquet': performance_analyzer.analyze_storage_metrics(
        config['aws']['target_bucket'], 'customers_parquet/'
    )
}

# Compare query performance
# Aggregate per category directly against the Delta table written during migration
test_query = """
    SELECT category, 
           COUNT(*) as transaction_count, 
           SUM(amount) as total_amount
    FROM delta.`{}` 
    GROUP BY category
""".format(delta_path)

query_metrics = performance_analyzer.measure_query_performance(test_query)

# Display results
# The original and optimized metrics use different keys ('transactions' vs
# 'transactions_delta'), so putting the two dicts side by side yields four
# misaligned rows full of NaN. Map the optimized keys back to their source
# dataset so each dataset gets one row, and expand each metrics dict into
# columns grouped under 'Original' / 'Optimized'.
# NOTE(review): assumes each metrics value is a flat dict of scalars, as the
# earlier pd.DataFrame(initial_metrics).T display already does.
optimized_to_original = {
    'transactions_delta': 'transactions',
    'customers_parquet': 'customers',
}
storage_comparison = pd.concat(
    {
        'Original': pd.DataFrame(initial_metrics).T,
        'Optimized': pd.DataFrame(optimized_metrics).T.rename(index=optimized_to_original),
    },
    axis=1,
)

print("\nStorage Comparison:")
display(storage_comparison)

print("\nQuery Performance:")
display(pd.DataFrame(query_metrics, index=[0]))

## 7. Visualize Results

In [None]:
# Create comparison visualization
# Only the storage comparison is drawn, so use a single panel; the previous
# 1x2 layout left an empty second axes in the figure.
fig, ax1 = plt.subplots(figsize=(10, 5))

# Storage comparison: one row per dataset, sizes read from 'total_size_gb'
storage_data = pd.DataFrame({
    'Original': [initial_metrics['transactions']['total_size_gb'], 
                initial_metrics['customers']['total_size_gb']],
    'Optimized': [optimized_metrics['transactions_delta']['total_size_gb'],
                  optimized_metrics['customers_parquet']['total_size_gb']]
}, index=['Transactions', 'Customers'])

storage_data.plot(kind='bar', ax=ax1)
ax1.set_title('Storage Size Comparison (GB)')
ax1.set_ylabel('Size (GB)')

# Add percentage improvements
# Iterate over values by position instead of storage_data[col][i]: integer
# lookups on a string-labelled Series are deprecated in pandas 2.x.
for position, (original_gb, optimized_gb) in enumerate(
    zip(storage_data['Original'], storage_data['Optimized'])
):
    if original_gb == 0:
        # A reduction relative to a zero-sized original is undefined; skip the label
        continue
    pct_change = (original_gb - optimized_gb) / original_gb * 100
    ax1.text(position, optimized_gb, 
             f'{pct_change:.1f}% reduction', 
             ha='center', va='bottom')

plt.tight_layout()
plt.show()

## 8. Clean Up Resources
Remove created AWS resources

In [None]:
# Clean up buckets
# NOTE(review): presumably deletes the buckets created by create_buckets(config)
# together with their contents — confirm in AWSManager before running against a
# shared account, since this is likely irreversible.
aws_manager.clean_up_buckets(config)

# Stop Spark session
# After this, cells that use `spark` or `performance_analyzer` need the
# initialization cell to be re-run.
spark.stop()