# E-commerce Fraud Detection using SageMaker

This notebook demonstrates how to build a machine learning model to detect fraudulent e-commerce transactions. We'll use SageMaker to train and deploy the model, then test it on simulated transaction data.

In [None]:
pip install faker

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import boto3
import sagemaker
from sagemaker import Session
from sagemaker.serializers import CSVSerializer
import io
from sklearn.datasets import dump_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import os
import sys
import time
import json

# Put the local ./src directory first on the import path so that the
# project's `package` module (config, generator) can be imported below.
sys.path.insert(0, os.path.abspath('./src/'))
from package import config

# Initialize AWS clients
# `session` is the SageMaker SDK session, `s3` a boto3 S3 resource and
# `sm_client` a low-level SageMaker client; later cells reuse all three.
session = sagemaker.Session()
s3 = boto3.resource('s3')
sm_client = boto3.client('sagemaker')

## Data Generation

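This section extends the base transaction generator with three additional fraud patterns: high-value purchases from unusual locations over a VPN, small gift-card purchases made on mobile devices, and declined transactions.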

In [None]:
def generate_enhanced_transaction_data(num_transactions=10000, fraud_ratio=0.1):
    """Generate synthetic transactions with extra fraud patterns layered on top.

    Legitimate rows come straight from ``generate_transaction(fraud_probability=0)``.
    Fraudulent rows come from ``generate_transaction(fraud_probability=1)`` and
    are then overwritten with one of three patterns.

    Args:
        num_transactions: Total number of rows to generate.
        fraud_ratio: Fraction of rows (0..1) generated as fraudulent.

    Returns:
        pandas.DataFrame with the legitimate rows first, followed by the
        fraudulent rows.
    """
    from package.generator import generate_transaction

    # Round the fraud share once and derive the legitimate share from it.
    # Truncating both products independently (int(n * r) and int(n * (1 - r)))
    # can lose a row to floating-point error, e.g. n=100, r=0.29 gave 28 + 71.
    num_fraud = int(round(num_transactions * fraud_ratio))
    num_legit = num_transactions - num_fraud

    # Generate legitimate transactions
    transactions = [generate_transaction(fraud_probability=0) for _ in range(num_legit)]

    # Generate fraudulent transactions with specific patterns
    for _ in range(num_fraud):
        transaction = generate_transaction(fraud_probability=1)

        # The second branch draws a fresh random number, so the effective
        # split is 30% pattern 1, 35% pattern 2 and 35% pattern 3.
        if np.random.random() < 0.3:
            # Pattern 1: High value transactions from unusual locations using VPN
            transaction['amount'] = float(np.random.uniform(1000, 5000))
            transaction['location'] = str(
                np.random.choice(['Tokyo, Japan', 'Berlin, Germany', 'Sydney, Australia'])
            )
            transaction['is_vpn'] = True
        elif np.random.random() < 0.5:
            # Pattern 2: Multiple small transactions using gift cards
            transaction['amount'] = float(np.random.uniform(50, 200))
            transaction['card_type'] = 'gift'
            transaction['device_type'] = 'mobile'
        else:
            # Pattern 3: Declined transactions attempted again
            transaction['status'] = 'declined'
            # Cast to a plain bool so the column does not mix numpy and Python
            # booleans and the record stays JSON-serializable.
            transaction['is_vpn'] = bool(np.random.choice([True, False], p=[0.7, 0.3]))

        transactions.append(transaction)

    # Convert to DataFrame
    return pd.DataFrame(transactions)

# Generate transaction data with enhanced fraud patterns
num_transactions = 20000  # Increase dataset size
fraud_ratio = 0.1        # 10% fraud rate for better training
df = generate_enhanced_transaction_data(num_transactions, fraud_ratio)

# Display sample data
print(f"Generated {len(df)} transactions")
# `is_vpn` is a single fraud signal, not the fraud label, so it is reported
# under its own name. The fraud rate comes from the generator's `is_fraud`
# column when present, otherwise from the ratio that was requested.
if 'is_fraud' in df.columns:
    print(f"Fraud percentage: {df['is_fraud'].mean() * 100:.2f}%")
else:
    print(f"Fraud percentage (requested): {fraud_ratio * 100:.2f}%")
print(f"VPN usage percentage: {df['is_vpn'].mean() * 100:.2f}%")
df.head()

## Enhanced Feature Engineering

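Next we derive model features from the raw transactions: time-of-day flags, a per-location risk score, per-user aggregates, and one-hot encodings of the categorical fields.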

In [None]:
def engineer_features(df):
    """Derive model-ready fraud features from raw transaction rows.

    Args:
        df: DataFrame with ``timestamp``, ``location``, ``user_id``, ``amount``,
            ``device_type``, ``card_type``, ``status`` and ``is_vpn`` columns.
            The input frame is not modified.

    Returns:
        A new DataFrame (index reset by the per-user merges) containing the
        original columns plus time flags, ``location_risk``, per-user
        aggregates, ``amount_zscore``, ``device_type_num``, integer one-hot
        columns for device/card/status, ``transaction_risk_score`` and, when
        the input has no ``is_fraud`` column, a rule-derived ``is_fraud`` label.
    """
    # Risk for a location that is not in the table below. This is the same
    # fallback format_transaction_for_prediction uses, so training and
    # inference agree instead of training on NaN.
    default_location_risk = 0.7

    # Create a copy to avoid modifying the original
    df_features = df.copy()

    # Extract time-based features
    df_features['timestamp'] = pd.to_datetime(df_features['timestamp'])
    df_features['hour_of_day'] = df_features['timestamp'].dt.hour
    df_features['day_of_week'] = df_features['timestamp'].dt.dayofweek
    df_features['is_weekend'] = (df_features['day_of_week'] >= 5).astype(int)
    hour = df_features['hour_of_day']
    df_features['is_night'] = ((hour < 6) | (hour >= 22)).astype(int)

    # Location risk score
    location_risk = {
        'California, USA': 0.2,
        'New York, USA': 0.2,
        'Texas, USA': 0.2,
        'Florida, USA': 0.3,
        'Illinois, USA': 0.2,
        'London, UK': 0.3,
        'Paris, France': 0.4,
        'Berlin, Germany': 0.4,
        'Tokyo, Japan': 0.5,
        'Sydney, Australia': 0.5,
        'Unknown': 0.9
    }
    # A missing location is treated as 'Unknown'; an unrecognised one gets the
    # default risk.
    df_features['location_risk'] = (
        df_features['location']
        .fillna('Unknown')
        .map(location_risk)
        .fillna(default_location_risk)
    )

    # User behavior features
    # Group by user ID to analyze user patterns
    user_transaction_counts = df_features.groupby('user_id').size().reset_index(name='user_transaction_count')
    df_features = df_features.merge(user_transaction_counts, on='user_id', how='left')

    user_amount_avg = df_features.groupby('user_id')['amount'].mean().reset_index(name='user_avg_amount')
    df_features = df_features.merge(user_amount_avg, on='user_id', how='left')

    # Amount deviation from the user's own mean, scaled by the global standard
    # deviation of `amount`. The std is computed once rather than per row, and
    # a zero or undefined std yields 0 instead of NaN/inf.
    amount_std = df_features['amount'].std()
    has_history = df_features['user_transaction_count'] > 1
    if pd.notna(amount_std) and amount_std > 0:
        deviation = (df_features['amount'] - df_features['user_avg_amount']) / amount_std
        # Users with a single transaction have no baseline to deviate from.
        df_features['amount_zscore'] = deviation.where(has_history, 0.0)
    else:
        df_features['amount_zscore'] = 0.0

    # Ordinal device code; device types outside this mapping become NaN
    df_features['device_type_num'] = df_features['device_type'].map({
        'mobile': 0, 'desktop': 1, 'tablet': 2
    })

    # Create dummies for categorical variables. dtype=int keeps the columns
    # 0/1 on every pandas version (newer versions default to bool, which
    # would otherwise serialize as "True"/"False").
    df_features = pd.get_dummies(
        df_features,
        columns=['device_type', 'card_type', 'status'],
        drop_first=False,
        dtype=int
    )

    # Convert boolean to integer
    df_features['is_vpn'] = df_features['is_vpn'].astype(int)

    # Define a fraud label only when the data does not already carry one.
    # The fallback rule is: VPN + amount above 500 + location risk above 0.3.
    if 'is_fraud' not in df_features.columns:
        df_features['is_fraud'] = ((df_features['is_vpn'] == 1) &
                                  (df_features['amount'] > 500) &
                                  (df_features['location_risk'] > 0.3)).astype(int)

    # Compute additional risk factors
    df_features['transaction_risk_score'] = (
        df_features['amount_zscore'] * 0.3 +
        df_features['location_risk'] * 0.2 +
        df_features['is_vpn'] * 0.2 +
        df_features['is_night'] * 0.1
    )

    # The declined-status term only exists if at least one row was declined
    if 'status_declined' in df_features.columns:
        df_features['transaction_risk_score'] += df_features['status_declined'] * 0.2

    return df_features

# Apply feature engineering
df_engineered = engineer_features(df)

# Display feature correlations with fraud
# Correlation is only defined for numeric data. The engineered frame still
# holds text columns (e.g. location, user_id) and the parsed timestamp, and
# recent pandas versions raise on those instead of silently dropping them.
numeric_features = df_engineered.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_features.corr()
fraud_correlations = correlation_matrix['is_fraud'].sort_values(ascending=False)
print("Top features correlated with fraud:")
print(fraud_correlations.head(10))

# Plot correlation heatmap (first 15 numeric columns only, for readability)
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix.iloc[:15, :15], annot=True, cmap='coolwarm')
plt.title('Feature Correlation Matrix')
plt.show()

## Model Training with Hyperparameter Tuning

In [None]:
# Numeric features produced by engineer_features that the model trains on
features = [
    'amount',
    'hour_of_day',
    'day_of_week',
    'is_weekend',
    'is_night',
    'location_risk',
    'is_vpn',
    'user_transaction_count',
    'amount_zscore',
    'transaction_risk_score',
]

# Append every column carrying one of the categorical prefixes, in frame order.
# Note that 'device_type_num' also matches the 'device_type_' prefix, so it is
# picked up here alongside the one-hot columns.
categorical_prefixes = ('device_type_', 'card_type_', 'status_')
categorical_columns = [
    col for col in df_engineered.columns if col.startswith(categorical_prefixes)
]
features.extend(categorical_columns)

# Define target
target = 'is_fraud'

# Feature matrix and label vector
X = df_engineered[features]
y = df_engineered[target]

# Hold out 20% for testing, keeping the fraud ratio equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")
print(f"Fraud ratio in training: {y_train.mean():.2f}")
print(f"Fraud ratio in testing: {y_test.mean():.2f}")

In [None]:
# Train using SageMaker built-in XGBoost algorithm
# First, prepare data in SVM light format required by SageMaker XGBoost
# NOTE(review): the next cell re-splits X_train and uploads to the same
# '<prefix>/train/train.libsvm' key, so this upload is overwritten there.
train_file = io.BytesIO()
dump_svmlight_file(X_train, y_train, train_file)
train_file.seek(0)  # rewind so the upload reads from the start of the buffer

# Upload data to S3
bucket = config.MODEL_DATA_S3_BUCKET
prefix = 'fraud-classifier'
key = f'{prefix}/train/train.libsvm'

s3.Bucket(bucket).Object(key).upload_fileobj(train_file)
train_data_s3_uri = f's3://{bucket}/{key}'
print(f"Uploaded training data to {train_data_s3_uri}")

# Set output location
output_s3_uri = f's3://{bucket}/{prefix}/output'

In [None]:
# Prepare data in SVM light format required by SageMaker XGBoost
# First properly split into train and validation. The split is stratified,
# like the train/test split above, so the minority fraud class keeps the
# same share in both parts.
# NOTE: this rebinds X_train/y_train, so re-running the cell shrinks the
# training set again; re-run the split cell above first.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Create train file
train_file = io.BytesIO()
dump_svmlight_file(X_train, y_train, train_file)
train_file.seek(0)  # rewind so the upload reads from the start of the buffer

# Create validation file
validation_file = io.BytesIO()
dump_svmlight_file(X_val, y_val, validation_file)
validation_file.seek(0)

# Upload data to S3
# The bucket name lives on the project config module; the bare name
# MODEL_DATA_S3_BUCKET is never defined in this notebook.
bucket = config.MODEL_DATA_S3_BUCKET
prefix = 'fraud-classifier'

# Upload train data
train_key = f'{prefix}/train/train.libsvm'
s3.Bucket(bucket).Object(train_key).upload_fileobj(train_file)
train_data_s3_uri = f's3://{bucket}/{train_key}'

# Upload validation data
val_key = f'{prefix}/validation/validation.libsvm'
s3.Bucket(bucket).Object(val_key).upload_fileobj(validation_file)
validation_data_s3_uri = f's3://{bucket}/{val_key}'

# Set output location
output_s3_uri = f's3://{bucket}/{prefix}/output'

print(f"Uploaded training data to {train_data_s3_uri}")
print(f"Uploaded validation data to {validation_data_s3_uri}")

# Get the XGBoost image - use the newer SageMaker API
from sagemaker import image_uris
container = image_uris.retrieve("xgboost", boto3.Session().region_name, version="1.0-1")

# Set up hyperparameter tuning job
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

# Define hyperparameter ranges searched by the tuner
hyperparameter_ranges = {
    'max_depth': IntegerParameter(3, 10),
    'eta': ContinuousParameter(0.01, 0.3),
    'gamma': ContinuousParameter(0, 5),
    'min_child_weight': IntegerParameter(1, 10),
    'subsample': ContinuousParameter(0.5, 1.0),
    'colsample_bytree': ContinuousParameter(0.5, 1.0)
}

# Create the estimator shared by all tuning trials.
# - The IAM role is read from the project config; the bare name
#   SAGEMAKER_IAM_ROLE is never defined in this notebook.
# - instance_count / instance_type are the SageMaker SDK v2 parameter names,
#   matching the v2 APIs used elsewhere here (image_uris, Predictor).
xgb = sagemaker.estimator.Estimator(
    container,
    role=config.SAGEMAKER_IAM_ROLE,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=output_s3_uri,
    sagemaker_session=session,
    base_job_name='fraud-detection-xgb'
)

# Set static hyperparameters (held constant across all tuning trials)
xgb.set_hyperparameters(
    objective='binary:logistic',
    eval_metric='auc',
    num_round=100,
    rate_drop=0.1,  # NOTE(review): only meaningful with the dart booster - confirm it is needed
    scale_pos_weight=10  # Helpful for imbalanced datasets
)

# Create the tuner with the correct metric name
tuner = HyperparameterTuner(
    xgb,
    'validation:auc',  # Make sure this matches eval_metric
    hyperparameter_ranges,
    max_jobs=5,
    max_parallel_jobs=2,
    objective_type='Maximize'
)

# Start the hyperparameter tuning job with both train and validation channels
tuner.fit({
    'train': train_data_s3_uri,
    'validation': validation_data_s3_uri
})
print("Hyperparameter tuning job started")

## Model Deployment and Testing

In [None]:
# Get the best model from hyperparameter tuning
# The tuning job name comes from the tuner created in the training cell; the
# SageMaker API response then names the best-scoring training job.
tuning_job_name = tuner.latest_tuning_job.job_name
best_job_name = sm_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)['BestTrainingJob']['TrainingJobName']

# Create model
# The name is suffixed with the current Unix time so each run registers a
# distinct model.
# NOTE(review): ModelDataUrl assumes the artifact was written to
# <output_s3_uri>/<training job name>/output/model.tar.gz - confirm.
model_name = f"fraud-detection-model-{int(time.time())}"
model_info = sm_client.create_model(
    ModelName=model_name,
    PrimaryContainer={
        'Image': container,
        'ModelDataUrl': f"{output_s3_uri}/{best_job_name}/output/model.tar.gz"
    },
    ExecutionRoleArn=config.SAGEMAKER_IAM_ROLE
)

In [None]:
# Create endpoint configuration: one variant on a single ml.t2.medium
# instance. No auto-scaling policy is attached in this cell.
endpoint_config_name = f"fraud-detection-config-{int(time.time())}"
endpoint_config = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[{
        'VariantName': 'default',
        'ModelName': model_name,
        'InitialInstanceCount': 1,
        'InstanceType': 'ml.t2.medium',
        'InitialVariantWeight': 1
    }]
)

# Create endpoint
# The endpoint name is fixed (derived from config.SOLUTION_PREFIX), unlike the
# time-stamped model and config names above.
# NOTE(review): re-running this cell while that endpoint exists will presumably
# fail in create_endpoint - confirm.
endpoint_name = f"{config.SOLUTION_PREFIX}-xgb"
endpoint = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name
)
print(f"Endpoint {endpoint_name} creation initiated")

# Wait for endpoint to become available
# Blocks on the boto3 'endpoint_in_service' waiter before continuing.
print("Waiting for endpoint to be in service...")
waiter = sm_client.get_waiter('endpoint_in_service')
waiter.wait(EndpointName=endpoint_name)
print(f"Endpoint {endpoint_name} is now in service")

In [None]:
# Create a predictor bound to the deployed endpoint.
# Requests are serialized as CSV; no deserializer is configured, and the cells
# below decode each response from UTF-8 bytes themselves.
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=session,
    serializer=CSVSerializer()
)

# Test on the test set
def format_features_for_prediction(row, feature_names=None):
    """Serialize one feature row as a comma-separated line for the endpoint.

    Args:
        row: Mapping-like object (e.g. a pandas Series from ``iterrows``) that
            is indexable by feature name.
        feature_names: Ordered feature names to emit. Defaults to the
            notebook-level ``features`` list used for training.

    Returns:
        A string of the values joined by commas, in ``feature_names`` order.
    """
    if feature_names is None:
        feature_names = features
    # Cast through float so boolean one-hot columns are sent as 1.0/0.0
    # rather than the strings "True"/"False", which are not numeric CSV.
    return ','.join(str(float(row[name])) for name in feature_names)

# Score the held-out test set against the live endpoint
print("Getting predictions for test data...")
y_pred_proba = []
batch_size = 100  # Rows handled per outer iteration

for start in range(0, len(X_test), batch_size):
    window = X_test.iloc[start:start + batch_size]

    # Serialize the whole window first ...
    payloads = [format_features_for_prediction(record) for _, record in window.iterrows()]

    # ... then invoke the endpoint once per row and parse each probability.
    window_scores = [
        float(predictor.predict(payload).decode('utf-8')) for payload in payloads
    ]

    y_pred_proba.extend(window_scores)

# Threshold the probabilities at 0.5 to get hard labels
y_pred = [int(score >= 0.5) for score in y_pred_proba]

In [None]:
# Evaluate the model
# Uses the hard labels (y_pred) for the report and confusion matrix, and the
# raw probabilities (y_pred_proba) for ROC AUC.
print("Model Evaluation:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {roc_auc:.4f}")

# Save model details for reference
# The confusion matrix is converted to nested lists so it can be JSON-encoded.
model_details = {
    'endpoint_name': endpoint_name,
    'model_name': model_name,
    'features': features,
    'performance': {
        'roc_auc': roc_auc,
        'confusion_matrix': cm.tolist()
    }
}

# Save to S3
# Written under the same bucket/prefix as the training data.
model_details_key = f'{prefix}/model-details.json'
s3.Bucket(bucket).Object(model_details_key).put(
    Body=json.dumps(model_details, indent=2)
)
print(f"Model details saved to s3://{bucket}/{model_details_key}")

## Sample Transactions Generator for Testing

In [None]:
def generate_test_transactions(num_transactions=10, fraud_ratio=0.3):
    """Generate a small batch of raw transactions for demonstrating the model.

    Args:
        num_transactions: Total number of transactions to return.
        fraud_ratio: Fraction of them (0..1) generated with
            ``fraud_probability=1``.

    Returns:
        List of transaction dicts as produced by ``generate_transaction``,
        legitimate ones first, then fraudulent ones.
    """
    from package.generator import generate_transaction

    # Round the fraud share once and derive the rest from it. Truncating both
    # products independently can drop a transaction to floating-point error,
    # e.g. 100 at ratio 0.29 gave 28 + 71.
    num_fraud = int(round(num_transactions * fraud_ratio))
    num_legit = num_transactions - num_fraud

    # Generate mostly legitimate transactions
    test_transactions = [generate_transaction(fraud_probability=0) for _ in range(num_legit)]

    # Generate some fraudulent transactions
    test_transactions.extend(
        generate_transaction(fraud_probability=1) for _ in range(num_fraud)
    )

    return test_transactions

# Generate test transactions
test_transactions = generate_test_transactions(20)

In [None]:
# Format transactions for prediction
def format_transaction_for_prediction(transaction, feature_names=None):
    """Turn one raw transaction dict into the CSV line the endpoint expects.

    Rebuilds the training-time features for a single transaction. The per-user
    features use placeholder values because no history is available here.

    Args:
        transaction: Raw transaction dict. Optional keys read: ``amount``,
            ``is_vpn``, ``timestamp``, ``location``, ``device_type``,
            ``card_type``, ``status``.
        feature_names: Ordered feature names to emit. Defaults to the
            notebook-level ``features`` list used for training.

    Returns:
        Comma-separated feature values in ``feature_names`` order. A feature
        this function cannot derive is emitted as 0.
    """
    import datetime

    if feature_names is None:
        feature_names = features

    features_dict = {}

    # Basic features
    features_dict['amount'] = float(transaction.get('amount', 0))
    features_dict['is_vpn'] = 1 if transaction.get('is_vpn', False) else 0

    # Time-based features. A missing timestamp means "now". The previous
    # default was an isoformat() string that the strict format below could
    # never parse, so it always fell through to the zero fallback.
    raw_timestamp = transaction.get('timestamp')
    try:
        if raw_timestamp is None:
            timestamp = datetime.datetime.now()
        else:
            timestamp = datetime.datetime.strptime(raw_timestamp, '%Y-%m-%dT%H:%M:%SZ')
    except (TypeError, ValueError):
        # Malformed timestamp: fall back to neutral time features
        features_dict['hour_of_day'] = 0
        features_dict['day_of_week'] = 0
        features_dict['is_weekend'] = 0
        features_dict['is_night'] = 0
    else:
        features_dict['hour_of_day'] = timestamp.hour
        features_dict['day_of_week'] = timestamp.weekday()
        features_dict['is_weekend'] = 1 if timestamp.weekday() >= 5 else 0
        features_dict['is_night'] = 1 if (timestamp.hour < 6 or timestamp.hour >= 22) else 0

    # Location risk
    location_risk = {
        'California, USA': 0.2,
        'New York, USA': 0.2,
        'Texas, USA': 0.2,
        'Florida, USA': 0.3,
        'Illinois, USA': 0.2,
        'London, UK': 0.3,
        'Paris, France': 0.4,
        'Berlin, Germany': 0.4,
        'Tokyo, Japan': 0.5,
        'Sydney, Australia': 0.5,
        'Unknown': 0.9
    }
    features_dict['location_risk'] = location_risk.get(transaction.get('location', 'Unknown'), 0.7)

    # Mock user behavior features (would come from historical data in production)
    features_dict['user_transaction_count'] = 5  # Dummy value
    features_dict['amount_zscore'] = (features_dict['amount'] - 200) / 100  # Dummy calculation

    # Categorical fields
    device_type = transaction.get('device_type', 'unknown')
    card_type = transaction.get('card_type', 'unknown')
    status = transaction.get('status', 'approved')

    # Transaction risk score, using the same weights as engineer_features,
    # including the extra 0.2 for declined transactions.
    features_dict['transaction_risk_score'] = (
        features_dict['amount_zscore'] * 0.3 +
        features_dict['location_risk'] * 0.2 +
        features_dict['is_vpn'] * 0.2 +
        features_dict['is_night'] * 0.1
    )
    if status == 'declined':
        features_dict['transaction_risk_score'] += 0.2

    # One-hot encoded categorical features. Keys must match the get_dummies
    # column names ('device_type_<value>'); the old 'devicetype<value>' key
    # never matched any feature, so the device columns were always sent as 0.
    features_dict[f'device_type_{device_type}'] = 1
    features_dict[f'card_type_{card_type}'] = 1
    features_dict[f'status_{status}'] = 1

    # Ordinal device code, same mapping as engineer_features
    device_type_codes = {'mobile': 0, 'desktop': 1, 'tablet': 2}
    if device_type in device_type_codes:
        features_dict['device_type_num'] = device_type_codes[device_type]

    # Assemble feature string in the correct order
    return ','.join(str(features_dict.get(name, 0)) for name in feature_names)

# Test the transactions against the model
# Each raw transaction is converted to a CSV feature line, sent to the
# endpoint, and the returned probability is thresholded at 0.5.
print("Testing transactions against the model...")
for transaction in test_transactions:
    features_str = format_transaction_for_prediction(transaction)

    # Get prediction (the response is UTF-8 bytes holding one probability)
    response = predictor.predict(features_str)
    fraud_probability = float(response.decode('utf-8'))
    is_fraud = fraud_probability >= 0.5

    print(f"Transaction {transaction['transaction_id']}:")
    print(f"Amount: ${transaction['amount']:.2f}, Location: {transaction['location']}")
    print(f"Device: {transaction['device_type']}, VPN: {transaction['is_vpn']}")
    print(f"Prediction: {'FRAUD' if is_fraud else 'LEGITIMATE'} (probability: {fraud_probability:.4f})")
    print("-" * 60)

## Kinesis Integration for Real-time Processing

In [None]:
def send_transactions_to_kinesis(transactions, stream_name=config.KINESIS_STREAM_NAME):
    """Publish each transaction to a Kinesis stream as a JSON record.

    Args:
        transactions: Iterable of JSON-serializable transaction dicts, each
            with a 'transaction_id' key (used as the partition key).
        stream_name: Target stream; defaults to the configured stream name.
    """
    kinesis = boto3.client('kinesis')

    for record in transactions:
        # Serialize, then publish one record per transaction
        payload = json.dumps(record)
        put_result = kinesis.put_record(
            StreamName=stream_name,
            Data=payload,
            PartitionKey=record['transaction_id'],
        )

        print(f"Sent transaction {record['transaction_id']} to Kinesis: Shard {put_result['ShardId']}")

        # Brief pause between puts to avoid throttling
        time.sleep(0.1)

# Build a fresh batch of transactions and push it through the stream
kinesis_test_transactions = generate_test_transactions(10, fraud_ratio=0.2)
send_transactions_to_kinesis(kinesis_test_transactions)

print(f"Sent {len(kinesis_test_transactions)} transactions to Kinesis stream {config.KINESIS_STREAM_NAME}")
print("These will be processed by the Lambda function and results stored in DynamoDB.")

## Dashboard for Monitoring

In [None]:
def check_dynamodb_for_results(transaction_ids, table_name=config.DYNAMODB_TABLE):
    """Look up each transaction id in DynamoDB and collect the items found.

    Args:
        transaction_ids: Iterable of transaction ids to fetch by primary key.
        table_name: Table to query; defaults to the configured table.

    Returns:
        List of the items that were found. Ids that are missing, or whose
        lookup raised, are reported on stdout and skipped.
    """
    table = boto3.resource('dynamodb').Table(table_name)
    found_items = []

    for tx_id in transaction_ids:
        try:
            reply = table.get_item(Key={'transaction_id': tx_id})
            if 'Item' not in reply:
                print(f"Transaction {tx_id} not found in DynamoDB yet.")
                continue
            found_items.append(reply['Item'])
        except Exception as e:
            # Best-effort lookup: report the failure and move on to the next id
            print(f"Error retrieving transaction {tx_id}: {str(e)}")

    return found_items

# Give Lambda some time to process
print("Waiting for Lambda to process transactions...")
time.sleep(10)  # Wait 10 seconds

# Check DynamoDB for results
tx_ids = [tx['transaction_id'] for tx in kinesis_test_transactions]
results = check_dynamodb_for_results(tx_ids)

# Display results
if results:
    results_df = pd.DataFrame(results)
    print(f"Found {len(results)} transactions in DynamoDB")

    # boto3's DynamoDB resource returns numeric attributes as decimal.Decimal,
    # which lands in an object-dtype column; cast to float so the bar heights
    # are plotted as numbers.
    results_df['fraud_probability'] = results_df['fraud_probability'].astype(float)

    # Plot fraud probabilities, with the 0.5 decision threshold as a red line
    plt.figure(figsize=(10, 6))
    sns.barplot(x='transaction_id', y='fraud_probability', data=results_df)
    plt.title('Fraud Probabilities for Test Transactions')
    plt.xlabel('Transaction ID')
    plt.ylabel('Fraud Probability')
    plt.xticks(rotation=45)
    plt.axhline(y=0.5, color='r', linestyle='--')
    plt.tight_layout()
    plt.show()
else:
    print("No results found in DynamoDB yet. Lambda might still be processing or there might be an issue.")

## Model Explanation and Monitoring

In [None]:
def explain_fraud_prediction(transaction, fraud_probability):
    """Build a rule-based explanation for a scored transaction.

    The risk factors are heuristics evaluated on the raw transaction; they are
    not derived from the model itself.

    Args:
        transaction: Raw transaction dict. ``transaction_id`` and ``amount``
            are required; ``is_vpn``, ``location``, ``card_type``,
            ``device_type``, ``status`` and ``timestamp`` are optional.
        fraud_probability: Model output for this transaction (0..1).

    Returns:
        Dict with ``transaction_id``, ``fraud_probability``, ``risk_score``,
        ``is_fraud``, ``risk_factors`` (list of strings) and ``explanation``.

    Raises:
        KeyError: If ``transaction_id`` or ``amount`` is missing.
    """
    # Imported locally: the notebook has no module-level `datetime` import.
    # Without it the lookup raised NameError inside a bare `except`, so the
    # night-hours check silently never ran.
    import datetime

    risk_factors = []

    # Check for high amount
    if transaction['amount'] > 500:
        risk_factors.append(f"High transaction amount: ${transaction['amount']:.2f}")

    # Check for VPN usage
    if transaction.get('is_vpn', False):
        risk_factors.append("VPN usage detected")

    # Check for unusual location
    location = transaction.get('location', '')
    if location in ['Tokyo, Japan', 'Berlin, Germany', 'Paris, France', 'Sydney, Australia']:
        risk_factors.append(f"Transaction from unusual location: {location}")

    # Check for gift card
    if transaction.get('card_type', '') == 'gift':
        risk_factors.append("Gift card usage")

    # Check for mobile device
    if transaction.get('device_type', '') == 'mobile':
        risk_factors.append("Mobile device used")

    # Check for declined status
    if transaction.get('status', '') == 'declined':
        risk_factors.append("Transaction was initially declined")

    # Check for night-time transaction. A missing or malformed timestamp just
    # skips this factor; any other error is allowed to surface.
    try:
        hour = datetime.datetime.strptime(
            transaction.get('timestamp', ''),
            '%Y-%m-%dT%H:%M:%SZ'
        ).hour
    except (TypeError, ValueError):
        hour = None
    if hour is not None and (hour < 6 or hour >= 22):
        risk_factors.append(f"Transaction occurred during night hours: {hour}:00")

    return {
        'transaction_id': transaction['transaction_id'],
        'fraud_probability': fraud_probability,
        'risk_score': fraud_probability,
        # Same >= 0.5 threshold the prediction cells use
        'is_fraud': fraud_probability >= 0.5,
        'risk_factors': risk_factors,
        'explanation': f"Transaction flagged with {fraud_probability:.1%} fraud probability due to {len(risk_factors)} risk factors."
    }

# Demonstrate explanation for a high-risk transaction
# "High-risk" here means VPN use combined with an amount above 500; nothing
# is printed when no test transaction matches.
high_risk_transactions = [tx for tx in test_transactions if tx.get('is_vpn', False) and tx.get('amount', 0) > 500]
if high_risk_transactions:
    sample_tx = high_risk_transactions[0]
    features_str = format_transaction_for_prediction(sample_tx)

    # Get prediction (the response is UTF-8 bytes holding one probability)
    response = predictor.predict(features_str)
    fraud_probability = float(response.decode('utf-8'))

    # Get explanation
    explanation = explain_fraud_prediction(sample_tx, fraud_probability)

    print("Sample Fraud Explanation:")
    print(f"Transaction: {sample_tx['transaction_id']}")
    print(f"Amount: ${sample_tx['amount']:.2f}, Location: {sample_tx['location']}")
    print(f"Fraud Probability: {fraud_probability:.2%}")
    print("Risk Factors:")
    for factor in explanation['risk_factors']:
        print(f"- {factor}")
    print(f"Explanation: {explanation['explanation']}")

## Summary and Next Steps

In [None]:
# Summary of the Fraud Detection Pipeline
# Printed as Markdown-formatted plain text; nothing here is computed.
print("""
# Fraud Detection Pipeline Summary

This notebook has established a complete fraud detection pipeline including:

1. **Data Generation**: Created realistic transaction data with fraud patterns
2. **Feature Engineering**: Transformed raw transaction data into predictive features
3. **Model Training**: Trained and tuned an XGBoost model for fraud detection
4. **Model Deployment**: Deployed the model to a SageMaker endpoint for real-time inference
5. **Real-time Processing**: Connected to Kinesis for real-time transaction processing
6. **Data Storage**: Stored processed transactions in DynamoDB
7. **Monitoring**: Created visualizations for model performance and transaction processing

## Production Considerations:

1. **Monitoring**: Implement CloudWatch dashboards to monitor endpoint performance and fraud rates
2. **Retraining**: Set up automatic retraining as new transaction data becomes available
3. **Alerting**: Enhance the SNS notification system for high-value fraudulent transactions
4. **Scaling**: Configure auto-scaling for the endpoint as transaction volume increases
5. **Cost Optimization**: Move to SageMaker serverless inference for cost efficiency

## Security Considerations:

1. **Data Encryption**: Ensure all data is encrypted at rest and in transit
2. **Access Control**: Implement strict IAM policies for accessing sensitive data
3. **Audit Logging**: Enable comprehensive logging of all fraud detection activities
4. **PII Handling**: Establish proper handling of personally identifiable information
""")