# E-commerce Fraud Detection using SageMaker

This notebook demonstrates how to build a machine learning model to detect fraudulent e-commerce transactions. We'll use SageMaker to train and deploy the model, then test it on simulated transaction data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import boto3
import sagemaker
from sagemaker import Session
from sagemaker.serializers import CSVSerializer
import io
from sklearn.datasets import dump_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import os
import sys
import time
import json

sys.path.insert(0, os.path.abspath('./src/'))
from package import config

# AWS handles shared by the rest of the notebook:
#   sm_client - low-level SageMaker API client
#   s3        - S3 resource used for uploads
#   session   - SageMaker SDK session
sm_client = boto3.client('sagemaker')
s3 = boto3.resource('s3')
session = Session()

In [None]:
# Import config variables
import sys
import os
sys.path.insert(0, os.path.abspath('./src/'))
from package import config

import json

# Load the stack outputs from the file.
# NOTE(review): the absolute path assumes the notebook runs from the
# /home/ec2-user/SageMaker directory - adjust it when running elsewhere.
with open('/home/ec2-user/SageMaker/stack_outputs_processed.json', 'r') as f:
    stack_outputs = json.load(f)

# Fail fast on missing values: a silent None here only surfaces much later
# (as a bad role ARN, a None bucket name, or an endpoint named "None-xgb").
_required_outputs = ('IamRole', 'ModelDataBucket', 'SolutionPrefix')
_missing_outputs = [key for key in _required_outputs if not stack_outputs.get(key)]
if _missing_outputs:
    raise KeyError(f"stack_outputs_processed.json is missing required keys: {_missing_outputs}")

SAGEMAKER_IAM_ROLE = stack_outputs['IamRole']
print(f"Using IAM Role: {SAGEMAKER_IAM_ROLE}")
bucket = stack_outputs['ModelDataBucket']
print(f"bucket: {bucket}")
SOLUTION_PREFIX = stack_outputs['SolutionPrefix']



# Get other variables you need


## Data Generation

Load the historical transaction data (with ground-truth fraud labels) from S3:

In [None]:
import pandas as pd
import boto3
import json
import io
import os
import logging

# Configure logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def load_transaction_data_from_s3(bucket_name=None, prefix='historical-data', filename='transactions.json'):
    """
    Download a JSON file of transactions from S3 and return it as a DataFrame.

    Parameters:
    bucket_name: S3 bucket to read from. When None, the notebook-level
        `bucket` variable is used.
    prefix: Key prefix (folder) of the object.
    filename: Object name under the prefix.

    Returns:
    DataFrame built from the decoded JSON content.

    Raises:
    Any exception from the S3 call, JSON decoding, or a missing 'is_fraud'
    column is logged and re-raised unchanged.
    """
    # Only fall back to the notebook-level bucket when the caller did not
    # pass one; previously the argument was always overwritten.
    if bucket_name is None:
        bucket_name = bucket

    # Initialize S3 client
    s3_client = boto3.client('s3')

    # Full S3 key
    s3_key = f"{prefix}/{filename}"

    try:
        logger.info(f"Loading transaction data from s3://{bucket_name}/{s3_key}")

        # Get object from S3
        response = s3_client.get_object(Bucket=bucket_name, Key=s3_key)

        # Read JSON content
        json_content = response['Body'].read().decode('utf-8')
        transactions = json.loads(json_content)

        # Convert to DataFrame
        df = pd.DataFrame(transactions)

        logger.info(f"Successfully loaded {len(df)} transactions from S3")
        logger.info(f"Fraud percentage: {df['is_fraud'].mean() * 100:.2f}%")

        return df

    except Exception as e:
        logger.error(f"Error loading transaction data from S3: {str(e)}")
        raise

# Example usage: load the transactions using the default bucket, prefix and filename.
df = load_transaction_data_from_s3()
# NOTE(review): more code appears to follow in the same cell, so this preview
# is not the cell's last expression and would not be rendered - confirm
# whether it should be moved to its own cell.
df.head()

# To use in a SageMaker notebook:
def load_data_for_training(bucket_name=None):
    """
    Fetch the transactions from S3, print a short summary, and hand them back.

    The summary shows the row count and the share of rows flagged in the
    'is_fraud' column. The returned DataFrame is the loaded data, unmodified.
    """
    transactions = load_transaction_data_from_s3(bucket_name)

    # Summary of what was loaded
    print(f"Loaded {len(transactions)} transactions from S3")
    print(f"Fraud percentage: {transactions['is_fraud'].mean() * 100:.2f}%")

    return transactions


## Enhanced Feature Engineering

Add more sophisticated feature engineering:

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def engineer_features_for_ml(df):
    """
    Feature engineering for ML-based fraud detection.
    Uses the ground truth fraud labels from the data rather than creating rule-based labels.

    Parameters:
    df: DataFrame of transactions. The code reads the columns 'is_fraud',
        'timestamp', 'user_id', 'transaction_id', 'amount', 'is_vpn',
        'location', 'device_type', 'card_type' and 'status'.

    Returns:
    A new DataFrame (the input is not modified) with time, per-user,
    fraud-rate and flag features added, and 'device_type', 'card_type',
    'status' and 'location' replaced by one-hot columns.

    Raises:
    ValueError: if 'is_fraud' is missing.
    """
    # Ensure we have a fraud label column from the data
    if 'is_fraud' not in df.columns:
        raise ValueError("Data must contain an 'is_fraud' column with ground truth labels")

    # Create a copy to avoid modifying the original
    df_features = df.copy()

    # Convert timestamp to datetime if it's not already
    if not pd.api.types.is_datetime64_dtype(df_features['timestamp']):
        df_features['timestamp'] = pd.to_datetime(df_features['timestamp'])

    # Extract time-based features
    timestamps = df_features['timestamp']
    df_features['hour_of_day'] = timestamps.dt.hour
    df_features['day_of_week'] = timestamps.dt.dayofweek
    df_features['month'] = timestamps.dt.month
    df_features['day_of_month'] = timestamps.dt.day
    # dayofweek is 0=Monday ... 6=Sunday, so 5 and 6 are the weekend
    df_features['is_weekend'] = (df_features['day_of_week'] >= 5).astype(int)
    # Night is 22:00-05:59
    df_features['is_night'] = (
        (df_features['hour_of_day'] < 6) | (df_features['hour_of_day'] >= 22)
    ).astype(int)

    # Per-user statistics over every transaction of that user in `df`
    user_stats = (
        df_features.groupby('user_id')
        .agg(
            user_transaction_count=('transaction_id', 'count'),
            user_avg_amount=('amount', 'mean'),
            user_amount_std=('amount', 'std'),
            user_min_amount=('amount', 'min'),
            user_max_amount=('amount', 'max'),
            user_vpn_ratio=('is_vpn', 'mean'),
        )
        .reset_index()
    )

    # Users with a single transaction have no sample std; treat it as 0.
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection, which does not reliably update the parent frame.
    user_stats['user_amount_std'] = user_stats['user_amount_std'].fillna(0)

    # Merge user statistics back to the main dataframe
    df_features = df_features.merge(user_stats, on='user_id', how='left')

    # Z-score of the amount relative to the user's own history; a std that is
    # not positive is replaced by 1 so the division is always defined.
    user_std = df_features['user_amount_std']
    safe_std = user_std.where(user_std > 0, 1)
    df_features['amount_zscore'] = (
        df_features['amount'] - df_features['user_avg_amount']
    ) / safe_std

    # Amount relative to the user's largest transaction (0 when the max is not positive)
    user_max = df_features['user_max_amount']
    df_features['amount_to_max_ratio'] = (df_features['amount'] / user_max).where(user_max > 0, 0)

    # Observed fraud rate per location / device type / card type.
    # NOTE(review): these rates are computed from the 'is_fraud' labels of the
    # same rows they are attached to, so they leak the target into the
    # features when this runs before a train/test split - consider fitting
    # them on the training rows only.
    df_features['location_fraud_rate'] = (
        df_features.groupby('location')['is_fraud'].transform('mean')
    )
    # Rows whose location could not be grouped fall back to the overall rate
    df_features['location_fraud_rate'] = df_features['location_fraud_rate'].fillna(
        df_features['is_fraud'].mean()
    )
    df_features['device_fraud_rate'] = (
        df_features.groupby('device_type')['is_fraud'].transform('mean')
    )
    df_features['card_fraud_rate'] = (
        df_features.groupby('card_type')['is_fraud'].transform('mean')
    )

    # Convert boolean to integer if needed
    if pd.api.types.is_bool_dtype(df_features['is_vpn']):
        df_features['is_vpn'] = df_features['is_vpn'].astype(int)

    # Create dummy variables for categorical features
    categorical_features = ['device_type', 'card_type', 'status', 'location']
    df_features = pd.get_dummies(df_features, columns=categorical_features, drop_first=False)

    # Is this amount more than two standard deviations from the user's mean?
    upper_bound = df_features['user_avg_amount'] + 2 * df_features['user_amount_std']
    lower_bound = df_features['user_avg_amount'] - 2 * df_features['user_amount_std']
    df_features['is_unusual_amount'] = (
        (df_features['amount'] > upper_bound) | (df_features['amount'] < lower_bound)
    ).astype(int)

    # Is this a new user (at most 3 transactions)?
    df_features['is_new_user'] = (df_features['user_transaction_count'] <= 3).astype(int)

    # Is this within 5% of the user's largest transaction?
    df_features['is_largest_tx'] = (
        df_features['amount'] >= df_features['user_max_amount'] * 0.95
    ).astype(int)

    return df_features

def create_ml_preprocessing_pipeline(df_features):
    """
    Build the scikit-learn ColumnTransformer used before model training.

    The listed numeric columns are standardised with StandardScaler; every
    other column is passed through untouched. `df_features` is accepted but
    not inspected by this version.
    """
    # Columns to standardise, grouped by where they come from
    time_columns = ['hour_of_day', 'day_of_week', 'month', 'day_of_month', 'is_weekend', 'is_night']
    user_columns = [
        'user_transaction_count', 'user_avg_amount', 'user_amount_std',
        'user_min_amount', 'user_max_amount', 'user_vpn_ratio',
    ]
    relative_amount_columns = ['amount_zscore', 'amount_to_max_ratio']
    fraud_rate_columns = ['location_fraud_rate', 'device_fraud_rate', 'card_fraud_rate']
    flag_columns = ['is_unusual_amount', 'is_new_user', 'is_largest_tx']

    scaled_columns = (
        ['amount']
        + time_columns
        + ['is_vpn']
        + user_columns
        + relative_amount_columns
        + fraud_rate_columns
        + flag_columns
    )

    # remainder='passthrough' keeps the dummy variables without scaling
    return ColumnTransformer(
        transformers=[('num', StandardScaler(), scaled_columns)],
        remainder='passthrough',
    )

def train_test_split_with_features(df):
    """
    Engineer features and show how they correlate with the fraud label.

    Despite its name, this version does not split the data: it prints the 15
    numeric features most positively correlated with 'is_fraud', draws a
    correlation heatmap for them, and returns the engineered DataFrame.

    Parameters:
    df: raw transactions, passed to engineer_features_for_ml.

    Returns:
    The engineered features DataFrame.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Apply feature engineering
    df_features = engineer_features_for_ml(df)

    # Calculate correlation only for numeric columns
    numeric_df = df_features.select_dtypes(include=['number'])
    correlation_matrix = numeric_df.corr()
    fraud_correlations = correlation_matrix['is_fraud'].sort_values(ascending=False)
    print("Top features correlated with fraud:")
    print(fraud_correlations.head(15))

    # Plot correlation heatmap for top correlated features. Exactly one figure
    # is created; the extra empty figure opened earlier has been removed.
    top_corr_features = fraud_correlations.index[:15]
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        correlation_matrix.loc[top_corr_features, top_corr_features],
        annot=True,
        cmap='coolwarm',
        ax=ax
    )
    ax.set_title('Feature Correlation Matrix')
    fig.tight_layout()
    plt.show()

    # Return the engineered features dataframe
    return df_features

## Feature Engineering


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Before preprocessing, check for duplicate columns


def engineer_features_for_ml(df):
    """
    Feature engineering for ML-based fraud detection.
    Uses the ground truth fraud labels from the data rather than creating rule-based labels.

    Parameters:
    df: DataFrame of transactions. The code reads the columns 'is_fraud',
        'timestamp', 'user_id', 'transaction_id', 'amount', 'is_vpn',
        'location', 'device_type', 'card_type' and 'status'.

    Returns:
    A new DataFrame (the input is not modified) with time, per-user,
    fraud-rate and flag features added, 'device_type', 'card_type', 'status'
    and 'location' replaced by one-hot columns, and any duplicated column
    names reduced to their first occurrence.

    Raises:
    ValueError: if 'is_fraud' is missing.
    """
    # Ensure we have a fraud label column from the data
    if 'is_fraud' not in df.columns:
        raise ValueError("Data must contain an 'is_fraud' column with ground truth labels")

    # Create a copy to avoid modifying the original
    df_features = df.copy()

    # Convert timestamp to datetime if it's not already
    if not pd.api.types.is_datetime64_dtype(df_features['timestamp']):
        df_features['timestamp'] = pd.to_datetime(df_features['timestamp'])

    # Extract time-based features
    timestamps = df_features['timestamp']
    df_features['hour_of_day'] = timestamps.dt.hour
    df_features['day_of_week'] = timestamps.dt.dayofweek
    df_features['month'] = timestamps.dt.month
    df_features['day_of_month'] = timestamps.dt.day
    # dayofweek is 0=Monday ... 6=Sunday, so 5 and 6 are the weekend
    df_features['is_weekend'] = (df_features['day_of_week'] >= 5).astype(int)
    # Night is 22:00-05:59
    df_features['is_night'] = (
        (df_features['hour_of_day'] < 6) | (df_features['hour_of_day'] >= 22)
    ).astype(int)

    # Per-user statistics over every transaction of that user in `df`
    user_stats = (
        df_features.groupby('user_id')
        .agg(
            user_transaction_count=('transaction_id', 'count'),
            user_avg_amount=('amount', 'mean'),
            user_amount_std=('amount', 'std'),
            user_max_amount=('amount', 'max'),
            user_vpn_ratio=('is_vpn', 'mean'),
        )
        .reset_index()
    )

    # Users with a single transaction have no sample std; treat it as 0.
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection, which does not reliably update the parent frame.
    user_stats['user_amount_std'] = user_stats['user_amount_std'].fillna(0)

    # Merge user statistics back to the main dataframe
    df_features = df_features.merge(user_stats, on='user_id', how='left')

    # Z-score of the amount relative to the user's own history; a std that is
    # not positive is replaced by 1 so the division is always defined.
    user_std = df_features['user_amount_std']
    safe_std = user_std.where(user_std > 0, 1)
    df_features['amount_zscore'] = (
        df_features['amount'] - df_features['user_avg_amount']
    ) / safe_std

    # Amount relative to the user's largest transaction (0 when the max is not positive)
    user_max = df_features['user_max_amount']
    df_features['amount_to_max_ratio'] = (df_features['amount'] / user_max).where(user_max > 0, 0)

    # Observed fraud rate per location / device type / card type.
    # NOTE(review): these rates are computed from the 'is_fraud' labels of the
    # same rows they are attached to, so they leak the target into the
    # features when this runs before a train/test split - consider fitting
    # them on the training rows only.
    df_features['location_fraud_rate'] = (
        df_features.groupby('location')['is_fraud'].transform('mean')
    )
    # Rows whose location could not be grouped fall back to the overall rate
    df_features['location_fraud_rate'] = df_features['location_fraud_rate'].fillna(
        df_features['is_fraud'].mean()
    )
    df_features['device_fraud_rate'] = (
        df_features.groupby('device_type')['is_fraud'].transform('mean')
    )
    df_features['card_fraud_rate'] = (
        df_features.groupby('card_type')['is_fraud'].transform('mean')
    )

    # Convert boolean to integer if needed
    if pd.api.types.is_bool_dtype(df_features['is_vpn']):
        df_features['is_vpn'] = df_features['is_vpn'].astype(int)

    # Create dummy variables for categorical features
    categorical_features = ['device_type', 'card_type', 'status', 'location']
    df_features = pd.get_dummies(df_features, columns=categorical_features, drop_first=False)

    # Is this amount more than two standard deviations from the user's mean?
    upper_bound = df_features['user_avg_amount'] + 2 * df_features['user_amount_std']
    lower_bound = df_features['user_avg_amount'] - 2 * df_features['user_amount_std']
    df_features['is_unusual_amount'] = (
        (df_features['amount'] > upper_bound) | (df_features['amount'] < lower_bound)
    ).astype(int)

    # Is this a new user (at most 3 transactions)?
    df_features['is_new_user'] = (df_features['user_transaction_count'] <= 3).astype(int)

    # Is this within 5% of the user's largest transaction?
    df_features['is_largest_tx'] = (
        df_features['amount'] >= df_features['user_max_amount'] * 0.95
    ).astype(int)

    # Check for duplicate columns and keep only the first occurrence of each name
    duplicate_mask = df_features.columns.duplicated()
    if duplicate_mask.any():
        print("Warning: Found duplicate columns after feature engineering.")
        duplicated_cols = df_features.columns[duplicate_mask].tolist()
        print(f"Duplicate columns: {duplicated_cols}")

        df_features = df_features.loc[:, ~duplicate_mask]
        print(f"Removed duplicate columns. New shape: {df_features.shape}")

    return df_features


# Fix for the create_ml_preprocessing_pipeline function
def create_ml_preprocessing_pipeline(df_features, X=None):
    """
    Create a scikit-learn preprocessing pipeline for ML model training.
    This ensures consistent feature transformation for training and prediction.

    Parameters:
    df_features: DataFrame with all features
    X: The actual X_train or X data to be processed (optional). It is only
       inspected, never modified: duplicate column names are reported, and
       numeric features absent from X are left out of the scaler.

    Returns:
    An unfitted ColumnTransformer that standardises the available numeric
    features and passes every other column through unchanged.
    """
    # Define numeric features to use
    numeric_features = [
        'amount', 'hour_of_day', 'day_of_week', 'month', 'day_of_month',
        'is_weekend', 'is_night', 'is_vpn', 'user_transaction_count',
        'user_avg_amount', 'user_amount_std', 'amount_zscore',
        'amount_to_max_ratio', 'location_fraud_rate', 'device_fraud_rate',
        'card_fraud_rate', 'is_unusual_amount', 'is_new_user', 'is_largest_tx'
    ]

    # Report duplicate column names in X. Rebinding a local name cannot change
    # the caller's DataFrame, so no removal is attempted (or claimed) here.
    if X is not None and X.columns.duplicated().any():
        dupe_cols = X.columns[X.columns.duplicated()].tolist()
        print(f"Warning: Found duplicate columns in X: {dupe_cols}")
        unique_column_count = int((~X.columns.duplicated(keep='first')).sum())
        print(
            "X is not modified here; drop the duplicates before fitting "
            f"(X has {X.shape[1]} columns, {unique_column_count} unique)."
        )

    # Ensure we only use features that exist in the dataframe and, when X is
    # given, in the data that will actually be transformed.
    available_numeric_features = [
        col for col in numeric_features
        if col in df_features.columns and (X is None or col in X.columns)
    ]

    print(f"Using these numeric features for preprocessing: {available_numeric_features}")

    # Create column transformer for preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), available_numeric_features),
        ],
        remainder='passthrough'  # This keeps the dummy variables without scaling
    )

    return preprocessor

def train_test_split_with_features(df):
    """
    Engineer features and produce a stratified 80/20 train/test split.

    Parameters:
    df: raw transactions, passed to engineer_features_for_ml.

    Returns:
    (df_features, X_train, X_test, y_train, y_test) where df_features is the
    full engineered DataFrame, X_* hold the selected feature columns and y_*
    hold the 'is_fraud' labels.
    """
    # Apply feature engineering
    df_features = engineer_features_for_ml(df)

    # Check for duplicate columns in the dataframe before selecting from it
    duplicates = df_features.columns[df_features.columns.duplicated()].tolist()
    if duplicates:
        print(f"Warning: Found duplicate columns: {duplicates}")
        # Drop duplicate columns
        df_features = df_features.loc[:, ~df_features.columns.duplicated()]
        print(f"Removed duplicate columns. New shape: {df_features.shape}")

    # Define numeric features to use for the model
    numeric_features = [
        'amount', 'hour_of_day', 'day_of_week', 'month', 'day_of_month',
        'is_weekend', 'is_night', 'is_vpn', 'user_transaction_count',
        'user_avg_amount', 'user_amount_std', 'amount_zscore',
        'amount_to_max_ratio', 'location_fraud_rate', 'device_fraud_rate',
        'card_fraud_rate', 'is_unusual_amount', 'is_new_user', 'is_largest_tx'
    ]

    # Get all one-hot encoded categorical columns. 'location_fraud_rate' also
    # starts with 'location_', so numeric features are excluded explicitly;
    # otherwise it would be selected twice and X would contain a duplicated
    # column.
    one_hot_prefixes = ('device_type_', 'card_type_', 'status_', 'location_')
    categorical_columns = [
        col for col in df_features.columns
        if col.startswith(one_hot_prefixes) and col not in numeric_features
    ]

    # Combine all features
    features = numeric_features + categorical_columns

    # Define target
    target = 'is_fraud'

    # Check if all features exist in the dataframe
    missing_features = [f for f in features if f not in df_features.columns]
    if missing_features:
        print(f"Warning: These features are missing from the dataframe: {missing_features}")
        # Keep only features that exist in the dataframe
        features = [f for f in features if f in df_features.columns]

    # Split the data
    X = df_features[features]
    y = df_features[target]

    # Stratify so both sets keep the same fraud ratio
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    print(f"Training set shape: {X_train.shape}")
    print(f"Testing set shape: {X_test.shape}")
    print(f"Fraud ratio in training: {y_train.mean():.2f}")
    print(f"Fraud ratio in testing: {y_test.mean():.2f}")

    return df_features, X_train, X_test, y_train, y_test



In [None]:
## Model Training with Hyperparameter Tuning

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Load data
df = load_data_for_training()

# Apply feature engineering only once
df_engineered = engineer_features_for_ml(df)

# Check for duplicated columns in the engineered dataframe before selecting from it
if df_engineered.columns.duplicated().any():
    print("Warning: Found duplicate columns in engineered dataframe.")
    dupe_cols = df_engineered.columns[df_engineered.columns.duplicated()].tolist()
    print(f"Duplicate columns: {dupe_cols}")

    # Remove duplicates, keeping only the first occurrence
    df_engineered = df_engineered.loc[:, ~df_engineered.columns.duplicated()]
    print(f"Removed duplicate columns. Shape: {df_engineered.shape}")

# Define features
numeric_features = [
    'amount', 'hour_of_day', 'day_of_week', 'month', 'day_of_month',
    'is_weekend', 'is_night', 'is_vpn', 'user_transaction_count',
    'user_avg_amount', 'user_amount_std', 'amount_zscore',
    'amount_to_max_ratio', 'location_fraud_rate', 'device_fraud_rate',
    'card_fraud_rate', 'is_unusual_amount', 'is_new_user', 'is_largest_tx'
]

# Get categorical (one-hot) columns. 'location_fraud_rate' also starts with
# 'location_', so numeric features are excluded explicitly; selecting it twice
# was what produced duplicate columns in X.
one_hot_prefixes = ('device_type_', 'card_type_', 'status_', 'location_')
categorical_columns = [
    col for col in df_engineered.columns
    if col.startswith(one_hot_prefixes) and col not in numeric_features
]

# Combine all features
features = numeric_features + categorical_columns

# Define target
target = 'is_fraud'

# Remove features that don't exist
features = [f for f in features if f in df_engineered.columns]

# Split the data
X = df_engineered[features]
y = df_engineered[target]

# The feature list is unique by construction, so X must not carry duplicated names
assert not X.columns.duplicated().any(), "duplicate feature columns selected"

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")
print(f"Fraud ratio in training: {y_train.mean():.2f}")
print(f"Fraud ratio in testing: {y_test.mean():.2f}")

# Create preprocessing pipeline, passing both the feature dataframe and X_train
preprocessor = create_ml_preprocessing_pipeline(df_engineered, X_train)

# Fit on the training data only, then apply the same transformation to the test data.
# NOTE(review): later cells appear to upload the unscaled X_train rather than
# these processed arrays - confirm which one is meant to be used for training.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"Processed training data shape: {X_train_processed.shape}")

In [None]:
import io
import boto3
import os
from sklearn.datasets import dump_svmlight_file

def upload_training_data_to_s3(X_train, y_train, X_val=None, y_val=None):
    """
    Write training (and optionally validation) data to S3 in SVMlight format.

    Parameters:
    X_train, y_train: training features and labels.
    X_val, y_val: optional validation features and labels; uploaded only when
        both are provided.

    Returns:
    Dict with 'train_data_s3_uri', 'validation_data_s3_uri' (None when no
    validation data was given) and 'output_s3_uri'.
    """
    s3 = boto3.resource('s3')
    # Read the notebook-level `bucket` into a differently named local.
    # Writing `bucket = bucket` here would make `bucket` local to the function
    # and raise UnboundLocalError before anything is uploaded.
    bucket_name = bucket
    print(bucket_name)
    prefix = 'fraud-classifier'

    def _upload_libsvm(features, labels, key):
        # Serialise to an in-memory SVMlight file and upload it under `key`
        buffer = io.BytesIO()
        dump_svmlight_file(features, labels, buffer)
        buffer.seek(0)
        s3.Bucket(bucket_name).Object(key).upload_fileobj(buffer)
        return f's3://{bucket_name}/{key}'

    # Upload training data to S3
    train_data_s3_uri = _upload_libsvm(X_train, y_train, f'{prefix}/train/train.libsvm')
    print(f"Uploaded training data to {train_data_s3_uri}")

    # If validation data is provided, upload it too
    val_data_s3_uri = None
    if X_val is not None and y_val is not None:
        val_data_s3_uri = _upload_libsvm(X_val, y_val, f'{prefix}/validation/validation.libsvm')
        print(f"Uploaded validation data to {val_data_s3_uri}")

    # Set output location
    output_s3_uri = f's3://{bucket_name}/{prefix}/output'

    return {
        'train_data_s3_uri': train_data_s3_uri,
        'validation_data_s3_uri': val_data_s3_uri,
        'output_s3_uri': output_s3_uri
    }


In [None]:
import io
import boto3
import os
import sagemaker
from sagemaker import Session
from sagemaker import image_uris
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from sklearn.datasets import dump_svmlight_file
from sklearn.model_selection import train_test_split

# Initialize S3 resource
s3 = boto3.resource('s3')
session = sagemaker.Session()

# Create a further split of training data to get a validation set.
# Stratified like the earlier train/test split so the validation set keeps
# the same fraud ratio.
# NOTE: this rebinds X_train / y_train, so re-running this cell without
# re-running the previous one shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Create train file in SVMlight format
train_file = io.BytesIO()
dump_svmlight_file(X_train, y_train, train_file)
train_file.seek(0)

# Create validation file in SVMlight format
validation_file = io.BytesIO()
dump_svmlight_file(X_val, y_val, validation_file)
validation_file.seek(0)

# `bucket` comes from the stack outputs loaded at the top of the notebook
prefix = 'fraud-classifier'

# Upload train data
train_key = f'{prefix}/train/train.libsvm'
s3.Bucket(bucket).Object(train_key).upload_fileobj(train_file)
train_data_s3_uri = f's3://{bucket}/{train_key}'

# Upload validation data
val_key = f'{prefix}/validation/validation.libsvm'
s3.Bucket(bucket).Object(val_key).upload_fileobj(validation_file)
validation_data_s3_uri = f's3://{bucket}/{val_key}'

# Set output location
output_s3_uri = f's3://{bucket}/{prefix}/output'

print(f"Uploaded training data to {train_data_s3_uri}")
print(f"Uploaded validation data to {validation_data_s3_uri}")

# Get the XGBoost image - use the newer SageMaker API
container = image_uris.retrieve("xgboost", boto3.Session().region_name, version="1.0-1")

# Define hyperparameter ranges
hyperparameter_ranges = {
    'max_depth': IntegerParameter(3, 10),
    'eta': ContinuousParameter(0.01, 0.3),
    'gamma': ContinuousParameter(0, 5),
    'min_child_weight': IntegerParameter(1, 10),
    'subsample': ContinuousParameter(0.5, 1.0),
    'colsample_bytree': ContinuousParameter(0.5, 1.0)
}

# Create the estimator. instance_count / instance_type are the SageMaker SDK
# v2 names; the train_instance_* spellings are deprecated there.
xgb = sagemaker.estimator.Estimator(
    container,
    role=SAGEMAKER_IAM_ROLE,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=output_s3_uri,
    sagemaker_session=session,
    base_job_name='fraud-detection-xgb'
)

# Set static hyperparameters
xgb.set_hyperparameters(
    objective='binary:logistic',
    eval_metric='auc',
    num_round=100,
    rate_drop=0.1,
    scale_pos_weight=10,  # Helpful for imbalanced datasets
    # Add the following for better handling of missing values and numerical stability
    tree_method='auto',
    max_delta_step=3,    # Helpful for unbalanced classes
    early_stopping_rounds=10
)

# Create the tuner with the correct metric name
tuner = HyperparameterTuner(
    xgb,
    'validation:auc',  # Make sure this matches eval_metric
    hyperparameter_ranges,
    max_jobs=5,
    max_parallel_jobs=2,
    objective_type='Maximize'
)

# Start the hyperparameter tuning job with both train and validation
tuner.fit({
    'train': train_data_s3_uri,
    'validation': validation_data_s3_uri
})
print("Hyperparameter tuning job started")

# Save feature information for later use with the model.
# 'location_fraud_rate' is a numeric feature even though it shares the
# 'location_' prefix with the one-hot location columns, so it is excluded
# from the prefix match.
one_hot_prefixes = ('device_type_', 'card_type_', 'status_', 'location_')
prefixed_numeric_columns = {'location_fraud_rate'}
categorical_feature_names = [
    col for col in X_train.columns
    if col.startswith(one_hot_prefixes) and col not in prefixed_numeric_columns
]
feature_info = {
    'feature_names': list(X_train.columns),
    'categorical_features': categorical_feature_names,
    'numeric_features': [col for col in X_train.columns if col not in categorical_feature_names]
}

# Store feature information in S3 for deployment
feature_info_key = f'{prefix}/model/feature_info.json'
s3.Bucket(bucket).Object(feature_info_key).put(
    Body=json.dumps(feature_info, indent=2)
)
print(f"Saved feature information to s3://{bucket}/{feature_info_key}")



In [None]:
## Model Deployment and Testing

In [None]:
import time
import boto3
import json
import os

# Initialize SageMaker client
sm_client = boto3.client('sagemaker')

# Get the best model from hyperparameter tuning
tuning_job_name = tuner.latest_tuning_job.job_name
tuning_job_description = sm_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)

# Give a clear error when no best training job is reported, instead of a bare KeyError
best_training_job = tuning_job_description.get('BestTrainingJob')
if not best_training_job:
    raise RuntimeError(
        f"Tuning job {tuning_job_name} has no best training job "
        f"(status: {tuning_job_description.get('HyperParameterTuningJobStatus')})"
    )
best_job_name = best_training_job['TrainingJobName']

print(f"Best training job: {best_job_name}")

# Describe the best training job once; it provides both the hyperparameters
# and the location of the model artifact.
best_job_description = sm_client.describe_training_job(
    TrainingJobName=best_job_name
)
best_hyperparameters = best_job_description['HyperParameters']

print("Best hyperparameters:")
for param, value in best_hyperparameters.items():
    print(f"  {param}: {value}")

# Use the artifact location reported by SageMaker rather than rebuilding it
# from the output path and job name.
model_data_url = best_job_description['ModelArtifacts']['S3ModelArtifacts']

# Create model
model_name = f"fraud-detection-model-{int(time.time())}"
model_info = sm_client.create_model(
    ModelName=model_name,
    PrimaryContainer={
        'Image': container,
        'ModelDataUrl': model_data_url
    },
    ExecutionRoleArn=SAGEMAKER_IAM_ROLE
)

print(f"Created model: {model_name}")

# Store model metadata including the feature information
model_metadata = {
    'model_name': model_name,
    'training_job': best_job_name,
    # Keys starting with '_' are left out of the stored metadata
    'hyperparameters': {k: v for k, v in best_hyperparameters.items() if not k.startswith('_')},
    'creation_time': time.time(),
    'feature_info': feature_info  # This comes from the previous step
}

# Save model metadata to S3
metadata_key = f'{prefix}/models/{model_name}/metadata.json'
s3.Bucket(bucket).Object(metadata_key).put(
    Body=json.dumps(model_metadata, indent=2)
)

print(f"Saved model metadata to s3://{bucket}/{metadata_key}")


In [None]:
import time
import boto3
import os
import json
import logging

# Configure logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Initialize SageMaker client
sm_client = boto3.client('sagemaker')

# Single source of truth for the hosting fleet, shared by the endpoint
# configuration and the registry record written to S3 below.
ENDPOINT_INSTANCE_TYPE = 'ml.t2.medium'
ENDPOINT_INSTANCE_COUNT = 1

# Create endpoint configuration: one fixed-size production variant.
# No auto-scaling policy is attached in this cell.
endpoint_config_name = f"fraud-detection-config-{int(time.time())}"
endpoint_config = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[{
        'VariantName': 'default',
        'ModelName': model_name,
        'InitialInstanceCount': ENDPOINT_INSTANCE_COUNT,
        'InstanceType': ENDPOINT_INSTANCE_TYPE,
        'InitialVariantWeight': 1
    }]
)

logger.info(f"Created endpoint configuration: {endpoint_config_name}")

# The endpoint name is fixed, so a second run of this cell would make
# create_endpoint fail because the name is already taken. Check first and
# switch the existing endpoint to the new configuration instead.
endpoint_name = f"{SOLUTION_PREFIX}-xgb"
try:
    sm_client.describe_endpoint(EndpointName=endpoint_name)
    endpoint_exists = True
except sm_client.exceptions.ClientError as error:
    # describe_endpoint reports an unknown endpoint as a ValidationException;
    # anything else (permissions, throttling, ...) is a real failure.
    if error.response['Error']['Code'] != 'ValidationException':
        raise
    endpoint_exists = False

if endpoint_exists:
    endpoint = sm_client.update_endpoint(
        EndpointName=endpoint_name,
        EndpointConfigName=endpoint_config_name
    )
    logger.info(f"Endpoint {endpoint_name} update initiated")
else:
    endpoint = sm_client.create_endpoint(
        EndpointName=endpoint_name,
        EndpointConfigName=endpoint_config_name
    )
    logger.info(f"Endpoint {endpoint_name} creation initiated")

# Wait for endpoint to become available
logger.info("Waiting for endpoint to be in service...")
waiter = sm_client.get_waiter('endpoint_in_service')
waiter.wait(EndpointName=endpoint_name)
logger.info(f"Endpoint {endpoint_name} is now in service")

# Store endpoint information in a central registry
endpoint_info = {
    'endpoint_name': endpoint_name,
    'model_name': model_name,
    'creation_time': time.time(),
    'instance_type': ENDPOINT_INSTANCE_TYPE,
    'instance_count': ENDPOINT_INSTANCE_COUNT,
    'endpoint_config': endpoint_config_name
}

# Save endpoint info to S3
endpoint_info_key = f'{prefix}/endpoints/{endpoint_name}/info.json'
s3.Bucket(bucket).Object(endpoint_info_key).put(
    Body=json.dumps(endpoint_info, indent=2)
)

print(f"Endpoint {endpoint_name} is ready for inference")


In [None]:
import boto3
import sagemaker
from sagemaker.serializers import CSVSerializer
import numpy as np
import time
import json
from tqdm import tqdm  # For progress bar

# Create a predictor for the endpoint
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=session,
    serializer=CSVSerializer()
)

# Score at most the first TEST_SAMPLE_SIZE records of the test set. The same
# constant slices features and labels so the two can never get out of step.
TEST_SAMPLE_SIZE = 10000
test_sample = X_test.head(TEST_SAMPLE_SIZE)
y_test_sample = y_test.iloc[:TEST_SAMPLE_SIZE]

# Print sample sizes
print(f"Using sample of {len(test_sample)} records for testing")
print(f"Sample fraud ratio: {y_test_sample.mean():.2f}")

# Get the expected feature count by reading the feature info from S3
def get_model_feature_count(default_count=38):
    """Return how many features the deployed model expects.

    The count is the length of the ``feature_names`` list in the feature-info
    JSON stored under ``{prefix}/model/feature_info.json``.

    Args:
        default_count: Value returned when the file cannot be read or holds no
            feature names. The default of 38 is the count the original
            notebook hard-coded "from the error message".
            NOTE(review): confirm 38 still matches the trained model.

    Returns:
        int: Number of features to send per prediction row.
    """
    feature_info_key = f'{prefix}/model/feature_info.json'
    try:
        response = s3.Bucket(bucket).Object(feature_info_key).get()
        saved_feature_info = json.loads(response['Body'].read().decode('utf-8'))
        feature_names = saved_feature_info.get('feature_names', [])
    except Exception as e:
        print(f"Could not get feature info from S3: {str(e)}")
        return default_count

    if not feature_names:
        # A count of 0 would make every prediction payload an empty string,
        # so treat a missing or empty list the same as an unreadable file.
        print(f"Feature info at s3://{bucket}/{feature_info_key} lists no feature names; "
              f"falling back to {default_count}")
        return default_count

    return len(feature_names)

# Get expected feature count for model
model_feature_count = get_model_feature_count()
print(f"Model expects {model_feature_count} features")

# Build the CSV payload for one row, sized to what the model expects
def format_features_for_prediction(row, feature_count=model_feature_count):
    """Serialize the leading columns of ``row`` as one comma-separated line.

    Exactly ``feature_count`` values are emitted: the row's columns are taken
    in their existing order, surplus columns are dropped, and '0' is appended
    for every position the row cannot fill.
    """
    column_names = list(row.index)

    # Number of real values we can take from the row (never negative)
    available = max(0, min(feature_count, len(column_names)))

    values = [str(row[name]) for name in column_names[:available]]
    values.extend(['0'] * (feature_count - available))  # zero padding

    return ','.join(values)

# Get predictions for test data
print("Getting predictions for test data...")
y_pred_proba = []
batch_size = 100  # Rows handled between throttling pauses

# A failed call still needs a value so predictions stay aligned with
# y_test_sample. Note that 0.5 is classified as fraud by the >= 0.5 cut below,
# so failures are counted and reported instead of passing silently.
FALLBACK_PROBABILITY = 0.5
failed_prediction_count = 0

# Use tqdm for a progress bar
for i in tqdm(range(0, len(test_sample), batch_size), desc="Processing batches"):
    batch = test_sample.iloc[i:i+batch_size]
    batch_features = [format_features_for_prediction(row) for _, row in batch.iterrows()]

    # Send each row separately to avoid CSV parsing issues
    batch_predictions = []
    for features_str in batch_features:
        try:
            response = predictor.predict(features_str)
            pred = float(response.decode('utf-8'))
            batch_predictions.append(pred)
        except Exception as e:
            failed_prediction_count += 1
            print(f"Error with prediction: {str(e)}")
            print(f"Problematic features: {features_str[:100]}...")  # Print first 100 chars
            batch_predictions.append(FALLBACK_PROBABILITY)

    y_pred_proba.extend(batch_predictions)

    # Small delay to avoid throttling
    time.sleep(0.1)

if failed_prediction_count:
    print(f"WARNING: {failed_prediction_count} of {len(y_pred_proba)} predictions failed and were "
          f"replaced with {FALLBACK_PROBABILITY}; the metrics below include these placeholders")

# Convert probabilities to binary predictions
y_pred = [1 if p >= 0.5 else 0 for p in y_pred_proba]

# Score the endpoint predictions against the sampled labels
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Text report
print("\nClassification Report:")
print(classification_report(y_test_sample, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test_sample, y_pred)
print("\nConfusion Matrix:")
print(cm)

# ROC AUC works on the raw probabilities, not the thresholded labels
roc_auc = roc_auc_score(y_test_sample, y_pred_proba)
print(f"ROC AUC: {roc_auc:.4f}")

# Share of actual fraud rows that were also predicted as fraud
actual_fraud_total = sum(y_test_sample)
if actual_fraud_total > 0:
    caught_fraud_total = sum(pred * actual for pred, actual in zip(y_pred, y_test_sample))
    sample_true_positive_rate = float(caught_fraud_total / actual_fraud_total)
else:
    sample_true_positive_rate = 0

# Everything that gets archived for this test run
test_results = {
    'endpoint_name': endpoint_name,
    'test_time': time.time(),
    'metrics': {
        'roc_auc': float(roc_auc),
        'confusion_matrix': cm.tolist(),
        'classification_report': classification_report(y_test_sample, y_pred, output_dict=True)
    },
    'test_size': len(test_sample),
    'positive_rate': float(sum(y_pred) / len(y_pred)),
    'true_positive_rate': sample_true_positive_rate
}

# Upload the results next to the endpoint's other records
test_results_key = f'{prefix}/endpoints/{endpoint_name}/test_results.json'
test_results_body = json.dumps(test_results, indent=2, default=str)
s3.Bucket(bucket).Object(test_results_key).put(Body=test_results_body)
print(f"Saved test results to s3://{bucket}/{test_results_key}")

# Per-transaction breakdown of endpoint predictions
def analyze_transactions(X_sample, y_sample, predictor, model_feature_count=model_feature_count):
    """Score each row of ``X_sample`` and compare it with its label.

    Rows are matched to ``y_sample`` by position. A row whose prediction fails
    is reported with a printed message and left out of the returned list.

    Returns:
        list[dict]: One entry per successfully scored row with the keys
        'index', 'prediction', 'probability', 'true_label', 'correct' and
        'features' (the row's 'amount' and 'is_vpn', or 'N/A' when absent).
    """
    analysis_rows = []
    for position, (idx, row) in enumerate(X_sample.iterrows()):
        payload = format_features_for_prediction(row, feature_count=model_feature_count)
        try:
            raw_response = predictor.predict(payload)
            fraud_probability = float(raw_response.decode('utf-8'))
            flagged_as_fraud = fraud_probability >= 0.5

            # Label for this row, looked up by position rather than index label
            actual_label = y_sample.iloc[position]

            analysis_rows.append({
                'index': idx,
                'prediction': flagged_as_fraud,
                'probability': fraud_probability,
                'true_label': actual_label,
                'correct': (flagged_as_fraud == actual_label),
                # A couple of raw features kept for eyeballing the result
                'features': {
                    'amount': row.get('amount', 'N/A'),
                    'is_vpn': row.get('is_vpn', 'N/A')
                }
            })
        except Exception as e:
            print(f"Error analyzing transaction {idx}: {str(e)}")

    return analysis_rows

# Pick up to five labelled examples of each class from the sampled test set
fraud_indices = y_test_sample[y_test_sample == 1].index[:5]
non_fraud_indices = y_test_sample[y_test_sample == 0].index[:5]

fraud_sample = test_sample.loc[fraud_indices]
non_fraud_sample = test_sample.loc[non_fraud_indices]


def _print_transaction_analysis(analysis):
    """Print one summary line and one feature line per analyzed transaction."""
    for entry in analysis:
        predicted_text = 'FRAUD' if entry['prediction'] else 'OK'
        actual_text = 'FRAUD' if entry['true_label'] else 'OK'
        print(f"Transaction {entry['index']}: Predicted {predicted_text} "
            f"(probability: {entry['probability']:.4f}), True label: {actual_text}, "
            f"Correct: {entry['correct']}")
        print(f"  Amount: {entry['features']['amount']}, VPN: {entry['features']['is_vpn']}")


# Run both groups through the endpoint and show the outcome
print("\nAnalyzing sample transactions...")
if fraud_sample.empty:
    print("No fraud samples in the test set subset")
else:
    print("\nSample Fraud Transactions Analysis:")
    fraud_analysis = analyze_transactions(fraud_sample, y_test_sample.loc[fraud_indices], predictor)
    _print_transaction_analysis(fraud_analysis)

if non_fraud_sample.empty:
    print("No non-fraud samples in the test set subset")
else:
    print("\nSample Non-Fraud Transactions Analysis:")
    non_fraud_analysis = analyze_transactions(non_fraud_sample, y_test_sample.loc[non_fraud_indices], predictor)
    _print_transaction_analysis(non_fraud_analysis)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import json
import boto3
import time

# Evaluate model performance
# y_pred and y_pred_proba were produced for test_sample only, so every metric
# in this cell is scored against y_test_sample. Scoring against the full y_test
# fails with mismatched lengths whenever the test set is larger than the sample.
print("Model Evaluation:")
report = classification_report(y_test_sample, y_pred)
print(report)

# Create and display confusion matrix
# labels=[0, 1] pins the 2x2 layout that the cm[row, col] lookups below rely on
print("Confusion Matrix:")
cm = confusion_matrix(y_test_sample, y_pred, labels=[0, 1])
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test_sample, y_pred_proba)
print(f"ROC AUC: {roc_auc:.4f}")

# Plot ROC curve
fpr, tpr, roc_thresholds = roc_curve(y_test_sample, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], 'k--')  # Random prediction line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

# Calculate metrics for business context
fraud_detected = cm[1, 1]  # True positives
fraud_missed = cm[1, 0]    # False negatives
false_alarms = cm[0, 1]    # False positives
correct_negatives = cm[0, 0]  # True negatives

# Calculate important business metrics (each guarded against an empty denominator)
detection_rate = fraud_detected / (fraud_detected + fraud_missed) if (fraud_detected + fraud_missed) > 0 else 0
false_positive_rate = false_alarms / (false_alarms + correct_negatives) if (false_alarms + correct_negatives) > 0 else 0
precision = fraud_detected / (fraud_detected + false_alarms) if (fraud_detected + false_alarms) > 0 else 0

print("\nBusiness Impact Metrics:")
print(f"Fraud Detection Rate: {detection_rate:.2%}")
print(f"False Alarm Rate: {false_positive_rate:.2%}")
print(f"Precision (% of flagged transactions that are actual fraud): {precision:.2%}")

# Save detailed model details for reference
model_details = {
    'endpoint_name': endpoint_name,
    'model_name': model_name,
    'features': features,
    'evaluation_timestamp': time.time(),
    'performance': {
        'roc_auc': float(roc_auc),
        'confusion_matrix': cm.tolist(),
        'classification_report': classification_report(y_test_sample, y_pred, output_dict=True),
        'business_metrics': {
            'fraud_detection_rate': float(detection_rate),
            'false_alarm_rate': float(false_positive_rate),
            'precision': float(precision),
            'fraud_cases_detected': int(fraud_detected),
            'fraud_cases_missed': int(fraud_missed),
            'false_alarms': int(false_alarms)
        }
    },
    'model_parameters': {
        'threshold': 0.5,  # Default classification threshold
    }
}

# Save to S3
model_details_key = f'{prefix}/model-details.json'
s3 = boto3.resource('s3')
s3.Bucket(bucket).Object(model_details_key).put(
    Body=json.dumps(model_details, indent=2)
)
print(f"Model details saved to s3://{bucket}/{model_details_key}")

# Calculate performance at different thresholds to find optimal threshold
thresholds_to_try = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
threshold_metrics = []

for threshold in thresholds_to_try:
    # Convert probabilities to predictions using this threshold
    y_pred_at_threshold = [1 if p >= threshold else 0 for p in y_pred_proba]

    # Confusion matrix at this threshold; for labels [0, 1] ravel() yields tn, fp, fn, tp
    cm_at_threshold = confusion_matrix(y_test_sample, y_pred_at_threshold, labels=[0, 1])
    tn, fp, fn, tp = cm_at_threshold.ravel()

    # Per-threshold rates use their own names so the 0.5-threshold
    # detection_rate / precision computed above are not overwritten
    threshold_detection_rate = tp / (tp + fn) if (tp + fn) > 0 else 0
    threshold_false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
    threshold_precision = tp / (tp + fp) if (tp + fp) > 0 else 0

    # Store metrics
    threshold_metrics.append({
        'threshold': threshold,
        'detection_rate': float(threshold_detection_rate),
        'false_alarm_rate': float(threshold_false_alarm_rate),
        'precision': float(threshold_precision),
        'true_positives': int(tp),
        'false_negatives': int(fn),
        'false_positives': int(fp),
        'true_negatives': int(tn)
    })

# Plot metrics across different thresholds
plt.figure(figsize=(10, 6))
thresholds = [m['threshold'] for m in threshold_metrics]
detection_rates = [m['detection_rate'] for m in threshold_metrics]
false_alarm_rates = [m['false_alarm_rate'] for m in threshold_metrics]
precisions = [m['precision'] for m in threshold_metrics]

plt.plot(thresholds, detection_rates, 'bo-', label='Detection Rate')
plt.plot(thresholds, false_alarm_rates, 'ro-', label='False Alarm Rate')
plt.plot(thresholds, precisions, 'go-', label='Precision')
plt.xlabel('Threshold')
plt.ylabel('Rate')
plt.title('Performance Metrics at Different Thresholds')
plt.legend()
plt.grid(True)
plt.show()

print("\nRecommended threshold based on business needs:")
# Among thresholds with a detection rate of at least 0.8, pick the one with the lowest false alarm rate
good_thresholds = [m for m in threshold_metrics if m['detection_rate'] >= 0.8]
if good_thresholds:
    recommended = min(good_thresholds, key=lambda x: x['false_alarm_rate'])
    print(f"Threshold: {recommended['threshold']}")
    print(f"Detection Rate: {recommended['detection_rate']:.2%}")
    print(f"False Alarm Rate: {recommended['false_alarm_rate']:.2%}")
    print(f"Precision: {recommended['precision']:.2%}")
else:
    print("No threshold meets minimum detection rate requirement of 80%")



In [None]:
import random
import datetime
import uuid

def generate_test_transactions(num_transactions=10, fraud_ratio=0.3):
    """Generate a shuffled mix of legitimate and fraudulent test transactions.

    Args:
        num_transactions: Total number of transactions to generate.
        fraud_ratio: Fraction of the total generated with ``is_fraud=True``.
            The resulting count is rounded to the nearest integer and kept
            within ``0..num_transactions``.

    Returns:
        list[dict]: Transactions as produced by ``generate_transaction``. The
        dicts carry no fraud label.
    """
    # round() rather than int(): a product such as 100 * 0.29 evaluates to
    # 28.999999999999996, which int() would truncate to 28.
    num_fraud = round(num_transactions * fraud_ratio)
    num_fraud = max(0, min(num_transactions, num_fraud))
    num_legitimate = num_transactions - num_fraud

    # Legitimate first, then fraudulent; the shuffle below mixes them
    test_transactions = [generate_transaction(is_fraud=False) for _ in range(num_legitimate)]
    test_transactions.extend(generate_transaction(is_fraud=True) for _ in range(num_fraud))

    # Shuffle to mix fraud and legitimate
    random.shuffle(test_transactions)

    return test_transactions

def generate_transaction(is_fraud=False):
    """Generate a single synthetic transaction.

    Args:
        is_fraud: When True the amount, device, location, VPN, card type and
            status are drawn from the fraud-leaning distributions below.

    Returns:
        dict: Keys 'transaction_id', 'user_id', 'timestamp', 'amount',
        'device_type', 'location', 'is_vpn', 'card_type' and 'status'.
    """
    # Generate a transaction ID
    transaction_id = f"T{uuid.uuid4().hex[:8].upper()}"

    # Generate a user ID
    user_id = f"U{random.randint(10000, 99999)}"

    # Generate timestamp (up to an hour in the past). The format string ends in
    # 'Z', which denotes UTC, so the clock is read in UTC rather than local time.
    timestamp = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59)
    )
    timestamp_str = timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')

    # Generate amount based on fraud flag: fraud is mostly large, sometimes tiny
    if is_fraud:
        amount = round(random.uniform(800, 3000), 2) if random.random() < 0.7 else round(random.uniform(0.5, 20), 2)
    else:
        amount = round(random.uniform(50, 500), 2)

    # Generate device type
    device_options = ['mobile', 'desktop', 'tablet']
    if is_fraud:
        device_type = random.choices(device_options, weights=[0.7, 0.2, 0.1])[0]
    else:
        device_type = random.choices(device_options, weights=[0.4, 0.5, 0.1])[0]

    # Generate location: the first five entries are US, the rest are foreign
    locations = [
        'California, USA', 'New York, USA', 'Texas, USA', 'Florida, USA',
        'Illinois, USA', 'London, UK', 'Paris, France', 'Berlin, Germany',
        'Tokyo, Japan', 'Sydney, Australia'
    ]

    if is_fraud:
        location = random.choice(locations[5:])  # Foreign locations
    else:
        location = random.choice(locations[:5])  # US locations

    # Generate VPN usage
    if is_fraud:
        is_vpn = random.choices([True, False], weights=[0.7, 0.3])[0]
    else:
        is_vpn = random.choices([True, False], weights=[0.1, 0.9])[0]

    # Generate card type
    card_options = ['credit', 'debit', 'gift']
    if is_fraud:
        card_type = random.choices(card_options, weights=[0.5, 0.2, 0.3])[0]
    else:
        card_type = random.choices(card_options, weights=[0.4, 0.55, 0.05])[0]

    # Generate status: only a share of fraud uses the skewed status weights
    status_options = ['approved', 'pending', 'declined']
    if is_fraud and random.random() < 0.3:
        status = random.choices(status_options, weights=[0.6, 0.2, 0.2])[0]
    else:
        status = random.choices(status_options, weights=[0.95, 0.03, 0.02])[0]

    # Create transaction
    return {
        'transaction_id': transaction_id,
        'user_id': user_id,
        'timestamp': timestamp_str,
        'amount': amount,
        'device_type': device_type,
        'location': location,
        'is_vpn': is_vpn,
        'card_type': card_type,
        'status': status
    }

# Example usage
if __name__ == "__main__":
    transactions = generate_test_transactions(20, fraud_ratio=0.3)
    print(f"Generated {len(transactions)} transactions")
    print(f"Sample transaction: {transactions[0]}")



In [None]:
import datetime
import boto3
import json
import logging
import numpy as np

# Configure logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

class TransactionFeatureExtractor:
    """Convert raw transaction dicts into comma-separated feature rows.

    The column order of every row follows ``feature_names`` as loaded from the
    feature-info JSON in S3. Features computed here that are not in that list
    are dropped, and listed features that are not computed are emitted as '0'.
    Per-user behavioural features come from an optional DynamoDB table, and
    per-category fraud rates come from S3 with built-in defaults as a fallback.
    """

    def __init__(self, s3_bucket=None, feature_info_key=None, dynamodb_table=None):
        """Create the AWS handles and load feature metadata and fraud rates.

        Args:
            s3_bucket: Bucket holding the feature info and fraud-rate files.
            feature_info_key: Object key of the feature-info JSON.
            dynamodb_table: Name of the table queried by ``get_user_stats``.
        """
        self.s3_client = boto3.client('s3')
        self.dynamodb = boto3.resource('dynamodb')
        self.s3_bucket = s3_bucket
        self.feature_info_key = feature_info_key
        self.dynamodb_table = dynamodb_table

        # Default feature lists in case we can't load from S3
        self.feature_names = []
        self.categorical_features = []
        self.numeric_features = []

        # Load feature info if provided
        if s3_bucket and feature_info_key:
            self.load_feature_info()

        # Cache for location, device and card fraud rates
        self.location_fraud_rates = {}
        self.device_fraud_rates = {}
        self.card_fraud_rates = {}

        # Load fraud rates if possible
        self.load_fraud_rates()

    @staticmethod
    def _plain_number(value):
        """Turn a ``Decimal`` into an int or float; return anything else unchanged.

        The boto3 DynamoDB resource returns numbers as ``decimal.Decimal``,
        which cannot be mixed with floats in arithmetic.
        """
        # Local import: the notebook does not import decimal at the top
        from decimal import Decimal

        if isinstance(value, Decimal):
            # Whole numbers stay ints so they serialize as '5' rather than '5.0'
            return int(value) if value == value.to_integral_value() else float(value)
        return value

    def load_feature_info(self):
        """Load feature names and feature groups from the feature-info JSON in S3.

        Errors are logged and leave the current (initially empty) lists in place.
        """
        try:
            response = self.s3_client.get_object(
                Bucket=self.s3_bucket,
                Key=self.feature_info_key
            )
            feature_info = json.loads(response['Body'].read().decode('utf-8'))

            self.feature_names = feature_info.get('feature_names', [])
            self.categorical_features = feature_info.get('categorical_features', [])
            self.numeric_features = feature_info.get('numeric_features', [])

            logger.info(f"Loaded feature info with {len(self.feature_names)} features")
        except Exception as e:
            logger.error(f"Error loading feature info: {str(e)}")

    def load_fraud_rates(self):
        """Load per-category fraud rates from S3, or fall back to built-in defaults.

        The defaults are used when no bucket is configured or when the
        'models/fraud_rates.json' object cannot be read.
        """
        if self.s3_bucket:
            try:
                fraud_rates_key = 'models/fraud_rates.json'
                response = self.s3_client.get_object(
                    Bucket=self.s3_bucket,
                    Key=fraud_rates_key
                )
                fraud_rates = json.loads(response['Body'].read().decode('utf-8'))

                self.location_fraud_rates = fraud_rates.get('location', {})
                self.device_fraud_rates = fraud_rates.get('device_type', {})
                self.card_fraud_rates = fraud_rates.get('card_type', {})

                logger.info("Loaded fraud rates for categorical features")
                return
            except Exception as e:
                logger.warning(f"Could not load fraud rates, using defaults: {str(e)}")
        else:
            # Without a bucket the S3 call could only fail, so skip it
            logger.info("No S3 bucket configured, using default fraud rates")

        # Default rates (the original notebook describes these as based on
        # historical data analysis)
        self.location_fraud_rates = {
            'California, USA': 0.02, 'New York, USA': 0.02, 'Texas, USA': 0.02,
            'Florida, USA': 0.03, 'Illinois, USA': 0.02, 'London, UK': 0.03,
            'Paris, France': 0.04, 'Berlin, Germany': 0.04, 'Tokyo, Japan': 0.05,
            'Sydney, Australia': 0.05, 'Unknown': 0.09
        }
        self.device_fraud_rates = {'mobile': 0.03, 'desktop': 0.01, 'tablet': 0.02}
        self.card_fraud_rates = {'credit': 0.025, 'debit': 0.01, 'gift': 0.05}

    def get_user_stats(self, user_id):
        """Get user transaction statistics from DynamoDB.

        Returns:
            None when no table is configured. Otherwise the item's 'stats'
            value (an empty dict when the item has no 'stats'), or a dict of
            default values when the user is not found or the lookup fails.
            Numbers read from DynamoDB are returned as stored, i.e. possibly
            as ``Decimal``.
        """
        if not self.dynamodb_table:
            return None

        try:
            table = self.dynamodb.Table(self.dynamodb_table)
            response = table.get_item(Key={'user_id': user_id})

            if 'Item' in response:
                return response['Item'].get('stats', {})
        except Exception as e:
            logger.error(f"Error retrieving user stats: {str(e)}")

        # Return default values if no data found or error
        return {
            'transaction_count': 1,
            'avg_amount': 0,
            'std_amount': 0,
            'max_amount': 0,
            'vpn_ratio': 0
        }

    def extract_features(self, transaction):
        """Build the CSV feature row for one transaction.

        Args:
            transaction: Dict with optional keys 'amount', 'is_vpn',
                'timestamp' (a '%Y-%m-%dT%H:%M:%SZ' string or a datetime),
                'user_id', 'location', 'device_type', 'card_type', 'status'.

        Returns:
            str: Values joined by ',' in ``self.feature_names`` order; an
            empty string when ``self.feature_names`` is empty.
        """
        # Create a dictionary to hold all features
        features_dict = {}

        # Basic transaction features
        features_dict['amount'] = float(transaction.get('amount', 0))
        features_dict['is_vpn'] = 1 if transaction.get('is_vpn', False) else 0

        # Time-based features
        try:
            if isinstance(transaction.get('timestamp'), str):
                timestamp = datetime.datetime.strptime(
                    transaction.get('timestamp'),
                    '%Y-%m-%dT%H:%M:%SZ'
                )
            else:
                timestamp = transaction.get('timestamp', datetime.datetime.now())

            features_dict['hour_of_day'] = timestamp.hour
            features_dict['day_of_week'] = timestamp.weekday()
            features_dict['month'] = timestamp.month
            features_dict['day_of_month'] = timestamp.day
            features_dict['is_weekend'] = 1 if timestamp.weekday() >= 5 else 0
            features_dict['is_night'] = 1 if (timestamp.hour < 6 or timestamp.hour >= 22) else 0
        except Exception as e:
            logger.warning(f"Error processing timestamp: {str(e)}")
            features_dict['hour_of_day'] = 0
            features_dict['day_of_week'] = 0
            features_dict['month'] = 1
            features_dict['day_of_month'] = 1
            features_dict['is_weekend'] = 0
            features_dict['is_night'] = 0

        # Get user statistics for behavioral features
        user_id = transaction.get('user_id')
        user_stats = self.get_user_stats(user_id)

        if user_stats:
            # DynamoDB hands numbers back as Decimal; convert them once so the
            # float arithmetic below cannot raise TypeError
            stats = {key: self._plain_number(value) for key, value in user_stats.items()}

            # User transaction count
            features_dict['user_transaction_count'] = stats.get('transaction_count', 1)

            # User average amount
            user_avg_amount = stats.get('avg_amount', 0)
            features_dict['user_avg_amount'] = user_avg_amount

            # User amount standard deviation
            user_amount_std = stats.get('std_amount', 1)
            features_dict['user_amount_std'] = user_amount_std

            # User min and max amount
            features_dict['user_min_amount'] = stats.get('min_amount', 0)
            features_dict['user_max_amount'] = stats.get('max_amount', 0)

            # VPN usage ratio
            features_dict['user_vpn_ratio'] = stats.get('vpn_ratio', 0)

            # Calculate z-score (0 when the user has no spread to compare against)
            if user_amount_std > 0:
                features_dict['amount_zscore'] = (features_dict['amount'] - user_avg_amount) / user_amount_std
            else:
                features_dict['amount_zscore'] = 0

            # Calculate amount to max ratio
            max_amount = stats.get('max_amount', 0)
            if max_amount > 0:
                features_dict['amount_to_max_ratio'] = features_dict['amount'] / max_amount
            else:
                features_dict['amount_to_max_ratio'] = 1

            # Is this an unusual amount for the user? (more than 2 std from the mean)
            is_unusual = 0
            if user_amount_std > 0 and abs(features_dict['amount_zscore']) > 2:
                is_unusual = 1
            features_dict['is_unusual_amount'] = is_unusual

            # Is this a new user?
            features_dict['is_new_user'] = 1 if features_dict['user_transaction_count'] <= 3 else 0

            # Is this (within 5% of) the user's largest transaction?
            features_dict['is_largest_tx'] = 1 if features_dict['amount'] >= max_amount * 0.95 else 0
        else:
            # Default values for new users or if user data not available
            features_dict['user_transaction_count'] = 1
            features_dict['user_avg_amount'] = 0
            features_dict['user_amount_std'] = 1
            features_dict['user_min_amount'] = 0
            features_dict['user_max_amount'] = 0
            features_dict['user_vpn_ratio'] = 0
            features_dict['amount_zscore'] = 0
            features_dict['amount_to_max_ratio'] = 1
            features_dict['is_unusual_amount'] = 0
            features_dict['is_new_user'] = 1
            features_dict['is_largest_tx'] = 1

        # Location risk score based on historical fraud rates
        location = transaction.get('location', 'Unknown')
        features_dict['location_fraud_rate'] = self.location_fraud_rates.get(location, 0.05)

        # Device fraud rate
        device_type = transaction.get('device_type', 'unknown')
        features_dict['device_fraud_rate'] = self.device_fraud_rates.get(device_type, 0.02)

        # Card fraud rate
        card_type = transaction.get('card_type', 'unknown')
        features_dict['card_fraud_rate'] = self.card_fraud_rates.get(card_type, 0.02)

        # Create one-hot encoded features for categorical variables
        # Location one-hot encoding, e.g. 'Tokyo, Japan' -> 'location_tokyo_japan'
        for loc in self.location_fraud_rates.keys():
            features_dict[f'location_{loc.replace(", ", "_").lower()}'] = 1 if location == loc else 0

        # Device type one-hot encoding
        for dev in ['mobile', 'desktop', 'tablet']:
            features_dict[f'device_type_{dev}'] = 1 if device_type == dev else 0

        # Card type one-hot encoding
        for card in ['credit', 'debit', 'gift']:
            features_dict[f'card_type_{card}'] = 1 if card_type == card else 0

        # Transaction status one-hot encoding
        status = transaction.get('status', 'approved')
        for st in ['approved', 'pending', 'declined']:
            features_dict[f'status_{st}'] = 1 if status == st else 0

        # Assemble feature string in the correct order based on feature names;
        # features this extractor does not compute default to '0'
        feature_values = [str(features_dict.get(feature, '0')) for feature in self.feature_names]

        return ','.join(feature_values)

    def format_batch_for_prediction(self, transactions):
        """Return one CSV feature row per transaction, in input order."""
        return [self.extract_features(tx) for tx in transactions]


# Example usage:
def process_transactions_for_prediction(transactions, model_bucket=None):
    """Turn raw transactions into CSV feature rows ready for the endpoint.

    The bucket comes from ``model_bucket`` or, when that is falsy, from the
    MODEL_DATA_S3_BUCKET environment variable. The user-history table name
    comes from TRANSACTION_HISTORY_TABLE. Both fall back to built-in names.
    """
    bucket_name = model_bucket or os.environ.get('MODEL_DATA_S3_BUCKET', 'ssense-fraud-model-data')
    history_table = os.environ.get('TRANSACTION_HISTORY_TABLE', 'ssense-fraud-user-history')

    feature_extractor = TransactionFeatureExtractor(
        s3_bucket=bucket_name,
        feature_info_key='fraud-classifier/model/feature_info.json',
        dynamodb_table=history_table
    )

    # One feature row per input transaction
    return feature_extractor.format_batch_for_prediction(transactions)



## Kinesis Integration for Real-time Processing


In [None]:
def send_transactions_to_kinesis(transactions, stream_name=config.KINESIS_STREAM_NAME):
    """Put each transaction on the Kinesis stream as its own JSON record.

    The transaction ID is used as the partition key. The shard that accepted
    each record is printed, and the loop pauses briefly between records.
    """
    kinesis_client = boto3.client('kinesis')

    for tx in transactions:
        # Serialize first, then send keyed by transaction ID
        payload = json.dumps(tx)
        tx_id = tx['transaction_id']
        put_result = kinesis_client.put_record(
            StreamName=stream_name,
            Data=payload,
            PartitionKey=tx_id
        )

        print(f"Sent transaction {tx_id} to Kinesis: Shard {put_result['ShardId']}")

        # Small delay to avoid throttling
        time.sleep(0.1)

# Push a small synthetic batch through the stream
kinesis_test_transactions = generate_test_transactions(10, fraud_ratio=0.2)
send_transactions_to_kinesis(kinesis_test_transactions)

sent_count = len(kinesis_test_transactions)
print(f"Sent {sent_count} transactions to Kinesis stream {config.KINESIS_STREAM_NAME}")
print("These will be processed by the Lambda function and results stored in DynamoDB.")

## Dashboard for Monitoring

In [None]:
def check_dynamodb_for_results(transaction_ids, table_name=config.DYNAMODB_TABLE):
    """Look up each transaction ID in DynamoDB and collect the items found.

    IDs that are missing or whose lookup raises are reported with a printed
    message and skipped.

    Returns:
        list: The stored items, in the order their IDs were given.
    """
    results_table = boto3.resource('dynamodb').Table(table_name)

    found_items = []
    for tx_id in transaction_ids:
        try:
            response = results_table.get_item(Key={'transaction_id': tx_id})
            if 'Item' not in response:
                print(f"Transaction {tx_id} not found in DynamoDB yet.")
                continue
            found_items.append(response['Item'])
        except Exception as e:
            print(f"Error retrieving transaction {tx_id}: {str(e)}")

    return found_items

# Give Lambda some time to process
print("Waiting for Lambda to process transactions...")
time.sleep(10)  # Wait 10 seconds

# Check DynamoDB for results
tx_ids = [tx['transaction_id'] for tx in kinesis_test_transactions]
results = check_dynamodb_for_results(tx_ids)

# Display results
if results:
    results_df = pd.DataFrame(results)
    print(f"Found {len(results)} transactions in DynamoDB")

    if 'fraud_probability' in results_df.columns:
        # The boto3 DynamoDB resource returns numbers as Decimal, which pandas
        # stores as an object column; cast to float so the bars plot numerically.
        plot_df = results_df.assign(
            fraud_probability=results_df['fraud_probability'].astype(float)
        )

        # Plot fraud probabilities
        plt.figure(figsize=(10, 6))
        sns.barplot(x='transaction_id', y='fraud_probability', data=plot_df)
        plt.title('Fraud Probabilities for Test Transactions')
        plt.xlabel('Transaction ID')
        plt.ylabel('Fraud Probability')
        plt.xticks(rotation=45)
        plt.axhline(y=0.5, color='r', linestyle='--')  # 0.5 decision threshold
        plt.tight_layout()
        plt.show()
    else:
        print("Stored results have no 'fraud_probability' field; skipping the plot.")
else:
    print("No results found in DynamoDB yet. Lambda might still be processing or there might be an issue.")