In [1]:
!pip install pandas scikit-learn sagemaker boto3 xgboost



In [2]:
import boto3
import sagemaker
import pandas as pd
from sklearn.model_selection import train_test_split
from sagemaker.xgboost.estimator import XGBoost

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [3]:
# Replace with your actual S3 bucket and prefix
bucket = 'dataset-paysim'  # Remove 's3://' and trailing slash
prefix = 'preprocessed-data/'  # Keep the prefix as it is

s3_data_path = f's3://dataset-paysim/preprocessed-data/'

In [4]:
s3 = boto3.client('s3') 

response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)

if 'Contents' in response:
    objects = response['Contents']
    print("Objects found in S3:", [obj['Key'] for obj in objects])  # Debugging step


Objects found in S3: ['preprocessed-data//preprocessed_data.csv']


In [5]:
import boto3
import pandas as pd

# Corrected bucket name
bucket = 'dataset-paysim'  # No 's3://' and no trailing '/'
prefix = 'preprocessed-data/'  # Check if this is correct

# List all files in the S3 prefix
response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)

if 'Contents' in response:
    objects = response['Contents']
    print("Objects found in S3:", [obj['Key'] for obj in objects])  # Debugging step

    all_data = []

    for obj in objects:
        if obj['Key'].endswith('.parquet'):  # Adapt for your file type (e.g., '.csv')
            file_path = obj['Key']
            s3_file = s3.get_object(Bucket=bucket, Key=file_path)
            df = pd.read_parquet(s3_file['Body'])  # Use pd.read_csv if it's a CSV
            all_data.append(df)

    if all_data:  # Only concatenate if data exists
        df = pd.concat(all_data, ignore_index=True)
        print(df.head())
    else:
        print("No valid Parquet files found in the given S3 prefix.")
else:
    print("No files found in the given S3 prefix.")

Objects found in S3: ['preprocessed-data//preprocessed_data.csv']
No valid Parquet files found in the given S3 prefix.


In [6]:
# Corrected bucket name
bucket = 'dataset-paysim'  # No 's3://' and no trailing '/'
prefix = 'preprocessed-data/'  # Check if this is correct

s3 = boto3.client('s3')

# List all files in the S3 prefix
response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)

if 'Contents' in response:
    objects = response['Contents']
    print("Objects found in S3:", [obj['Key'] for obj in objects])  # Debugging step

    all_data = []
    
    if obj['Key'].endswith('.csv'):
        file_path = obj['Key']
        s3_file = s3.get_object(Bucket=bucket, Key=file_path)
        df = pd.read_csv(s3_file['Body'])  # Read CSV instead of Parquet
        all_data.append(df)

    if all_data:  # Only concatenate if data exists
        df = pd.concat(all_data, ignore_index=True)
        print(df.head())

    else:
        print("No valid Parquet files found in the given S3 prefix.")
else:
    print("No files found in the given S3 prefix.")

Objects found in S3: ['preprocessed-data//preprocessed_data.csv']
   step    amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
0     1   9839.64       170136.0       160296.36             0.0   
1     1   1864.28        21249.0        19384.72             0.0   
2     1    181.00          181.0            0.00             0.0   
3     1    181.00          181.0            0.00         21182.0   
4     1  11668.14        41554.0        29885.86             0.0   

   newbalanceDest  isFraud  isFlaggedFraud  type_CASH_IN  type_CASH_OUT  \
0             0.0        0               0         False          False   
1             0.0        0               0         False          False   
2             0.0        1               0         False          False   
3             0.0        1               0         False           True   
4             0.0        0               0         False          False   

   type_DEBIT  type_PAYMENT  type_TRANSFER  
0       False          True  

In [8]:
# Separate features (X) and target variable (y)

# df = pd.read_csv('path/to/your/data.csv')

X = df.drop('isFraud', axis=1)  # Replace 'isFraud' with your target column name
y = df['isFraud']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Adjust test_size and random_state as needed

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Training set size: 5090096
Test set size: 1272524


In [9]:
import os

# Create a directory to store the training data
train_data_dir = 'train_data'
os.makedirs(train_data_dir, exist_ok=True)  # Create if it doesn't exist

# Combine X_train and y_train into a single DataFrame for saving
train_df = pd.concat([X_train, y_train], axis=1)

# Save the training data to a CSV file
train_df.to_csv(os.path.join(train_data_dir, 'train.csv'), index=False)

# Upload the training data to S3
sagemaker_session = sagemaker.Session()
train_data_s3_path = sagemaker_session.upload_data(path=train_data_dir, bucket=bucket, key_prefix='train')

print(f"Training data uploaded to: {train_data_s3_path}")

Training data uploaded to: s3://dataset-paysim/train


In [11]:
# Create a directory to store the test data
test_data_dir = 'test_data'
os.makedirs(test_data_dir, exist_ok=True)  # Create if it doesn't exist

# Combine X_test and y_test into a single DataFrame for saving
test_df = pd.concat([X_test, y_test], axis=1)

# Save the test data to a CSV file
test_df.to_csv(os.path.join(test_data_dir, 'test.csv'), index=False)

# Upload the training data to S3
sagemaker_session = sagemaker.Session()
test_data_s3_path = sagemaker_session.upload_data(path=test_data_dir, bucket=bucket, key_prefix='test')

print(f"Testing data uploaded to: {test_data_s3_path}")

Testing data uploaded to: s3://dataset-paysim/test


In [12]:
from sagemaker.inputs import TrainingInput

# Define the training input channel
train_input = TrainingInput(
    s3_data=train_data_s3_path,
    content_type='csv'
)

data_channels = {'train': train_input}

In [13]:
# Define the Sagemaker execution role
role = sagemaker.get_execution_role()

# Define the S3 output path for saving the model
output_path = f's3://dataset-paysim/xgboost-model-output'

# Configure the XGBoost estimator
xgb = XGBoost(
    entry_point='xgboost_script.py',  # training script (see below)
    instance_type='ml.m5.xlarge',
    instance_count=1,
    framework_version='1.0-1',  # Or the latest version
    output_path=output_path,
    sagemaker_session=sagemaker.Session(),
    role=role,
    objective='binary:logistic',
    num_round=100
)

In [14]:
xgb.fit(data_channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-02-16-14-43-15-284


2025-02-16 14:43:17 Starting - Starting the training job...
2025-02-16 14:43:46 Downloading - Downloading input data......
2025-02-16 14:44:21 Downloading - Downloading the training image..[34m[2025-02-16 14:44:58.427 ip-10-0-152-8.ap-southeast-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Invoking user training script.[0m
[34mINFO:sagemaker-containers:Module xgboost_script does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34mINFO:sagemaker-containers:Generating setup.cfg[0m
[34mINFO:sagemaker-containers:Generating MANIFEST.in[0m
[34mINFO:sagemaker-containers:Installing module with the following command:[0m
[34m/miniconda3/bin/python3 -m pip install . [0m
[34mProcessing /opt/ml/code
  Preparing metadata (setup.p

In [16]:
from sagemaker.xgboost.model import XGBoostModel
from sagemaker import get_execution_role

# Get the trained model location from S3 (Check this path in the SageMaker console)
model_data_path = 's3://dataset-paysim/xgboost-model-output/sagemaker-xgboost-2025-02-16-06-29-54-221/output/model.tar.gz'  # Replace with actual path

# Get the IAM role
role = get_execution_role()

# Create an XGBoost Model object
xgb_model = XGBoostModel(
    model_data=model_data_path,
    role=role,
    framework_version='1.0-1'
)

# Deploy the model to a SageMaker endpoint
predictor = xgb_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    endpoint_name='fraud-detection-ep1',
    serializer=sagemaker.serializers.CSVSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer()
)

print("Model successfully deployed!")


INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-02-16-14-50-53-188
INFO:sagemaker:Creating endpoint-config with name fraud-detection-ep1
INFO:sagemaker:Creating endpoint with name fraud-detection-ep1


-----!Model successfully deployed!


In [18]:
import boto3

sm_client = boto3.client("sagemaker")

endpoint_name = "fraud-detection-ep1"  # Ensure this matches your deployment

response = sm_client.describe_endpoint(EndpointName=endpoint_name)
print("Endpoint Status:", response["EndpointStatus"])


Endpoint Status: InService


In [26]:
import numpy as np
import pandas as pd
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# Initialize predictor
predictor = sagemaker.predictor.Predictor(
    endpoint_name="fraud-detection-ep1",
    serializer=CSVSerializer(),  # Ensures data is sent in correct format
    deserializer=JSONDeserializer()  # Parses JSON response
)

# Ensure all data is numeric
sample_data = X_test.iloc[:1].copy() # Select one row for prediction
sample_data = sample_data.astype(float)  # Convert to float if necessary

# Convert to numpy array
sample_data_array = sample_data.to_numpy()

# Convert to CSV format (without headers)
payload = ",".join(map(str, sample_data_array.flatten()))

# Make prediction
prediction = predictor.predict(payload)
print("Fraud Prediction:", prediction)



Fraud Prediction: 9.382477401231881e-06


ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (415) from primary with message "Loading csv data failed with Exception, please ensure data is in csv format:
 <class 'ValueError'>
 could not convert string to float: 'True'". See https://ap-southeast-2.console.aws.amazon.com/cloudwatch/home?region=ap-southeast-2#logEventViewer:group=/aws/sagemaker/Endpoints/fraud-detection-ep in account 985539759309 for more information.

In [27]:
#### Amazon CloudWatch for Real-Time Monitoring

In [28]:
import boto3
import json
from datetime import datetime

# Initialize CloudWatch Logs
logs_client = boto3.client("logs")

log_group = "FraudDetectionLogs"
log_stream = "Predictions"

# Create Log Group and Log Stream if not exists
try:
    logs_client.create_log_group(logGroupName=log_group)
except logs_client.exceptions.ResourceAlreadyExistsException:
    pass

try:
    logs_client.create_log_stream(logGroupName=log_group, logStreamName=log_stream)
except logs_client.exceptions.ResourceAlreadyExistsException:
    pass

# Log fraud prediction
log_event = {
    "timestamp": int(datetime.utcnow().timestamp() * 1000),
    "message": json.dumps({"prediction": prediction, "timestamp": str(datetime.utcnow())})
}

logs_client.put_log_events(
    logGroupName=log_group,
    logStreamName=log_stream,
    logEvents=[log_event]
)

print("Logged prediction to CloudWatch.")

Logged prediction to CloudWatch.


In [29]:
import boto3
import csv

s3_bucket = "dataset-paysim"
s3_path = "fraud-detection-results/predictions.csv"

# Convert prediction to CSV format
with open("/tmp/predictions.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["TransactionID", "Prediction"])
    writer.writerow([123456, prediction])  # Example transaction ID

# Upload results to S3
s3 = boto3.client("s3")
s3.upload_file("/tmp/predictions.csv", s3_bucket, s3_path)
print("Predictions stored in S3 for Athena analysis.")


Predictions stored in S3 for Athena analysis.


In [30]:
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: fraud-detection-ep1
INFO:sagemaker:Deleting endpoint with name: fraud-detection-ep1


## Hyperparameter Tuning

In [19]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

# Define hyperparameter ranges
hyperparameter_ranges = {
    'num_round': IntegerParameter(50, 200),
    'eta': ContinuousParameter(0.01, 0.3),
    'max_depth': IntegerParameter(3, 7)
}

# Objective metric name
objective_metric_name = 'validation:auc'

# Create Hyperparameter Tuner
tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=3,
    max_parallel_jobs=1
)

# Fit the tuner with data channels
tuner.fit(data_channels)

INFO:sagemaker:Creating hyperparameter tuning job with name: sagemaker-xgboost-250216-1235


.............................................................................................................................................*


UnexpectedStatusException: Error for HyperParameterTuning job sagemaker-xgboost-250216-1235: Failed. Reason: No objective metrics found after running 3 training jobs. Please ensure that the custom algorithm is emitting the objective metric as defined by the regular expression provided.

In [22]:
# 4. Create a SageMaker Model object
# This represents your trained model and its inference environment.

model = XGBoostModel(
    model_data=model_data,
    role=role,
    image_uri=image_uri,
    sagemaker_session=sagemaker.Session()
)

# 5. Create a unique endpoint name
timestamp = int(time.time())
unique_endpoint_name = f"sagemaker-xgboost-{timestamp}"

# 6. Deploy the model
# This creates the endpoint configuration and the endpoint itself.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    endpoint_name=unique_endpoint_name
)


AttributeError: 'NoneType' object has no attribute 'split'