## Step 1: Import Packages and Declare Constants

In [1]:
import boto3
import sagemaker
import datetime as dt
import pandas as pd

sagemaker.config INFO - Fetched defaults config from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


In [2]:
# Replace this value with the name of the S3 bucket you created.
default_bucket = "amazon-sagemaker-196132265367-us-west-2-f2b2cbe646a5"

In [3]:
# AWS session context used by the rest of the notebook.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()

# Names and versions that are handed to get_pipeline() in Step 5.
sklearn_processor_version = "0.23-1"
model_package_group_name = "ChurnModelPackageGroup"
pipeline_name = "ChurnModelSMPipeline"

# NOTE(review): despite the variable name, this resolves the scikit-learn
# framework image for the version above; it is later passed to get_pipeline()
# as custom_image_uri. Confirm that this is the intended image.
clarify_image = sagemaker.image_uris.retrieve(
    framework="sklearn",
    version=sklearn_processor_version,
    region=region,
)

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


## Step 2: Generate Baseline Dataset

The baseline data is used by the SageMaker Clarify step to generate SHAP values.

In [4]:
def preprocess_data(file_path):
    """Load the store data CSV and turn it into a numeric feature frame.

    Steps, in order:
      1. Parse ``firstorder`` and ``lastorder`` as datetimes; values that
         cannot be parsed become NaT.
      2. Drop every row that contains a null (including the NaT values above).
      3. Add ``first_last_days_diff`` (lastorder - firstorder, in days) and
         ``created_first_days_diff`` (created - firstorder, in days).
      4. Drop ``custid``, ``created``, ``firstorder`` and ``lastorder``.
      5. One-hot encode ``favday`` and ``city``.

    Args:
        file_path: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        pandas.DataFrame with the remaining original columns, the two
        day-difference columns and 0/1 indicator columns named
        ``favday_<value>`` / ``city_<value>``.

    Raises:
        Whatever ``pd.to_datetime`` raises if ``created`` holds a value that
        cannot be parsed (that column is parsed without ``errors='coerce'``).
    """
    df = pd.read_csv(file_path)
    ## Convert to datetime columns
    df["firstorder"] = pd.to_datetime(df["firstorder"], errors="coerce")
    df["lastorder"] = pd.to_datetime(df["lastorder"], errors="coerce")
    ## Drop Rows with null values
    df = df.dropna()
    ## Create Column which gives the days between the last order and the first order
    df["first_last_days_diff"] = (df["lastorder"] - df["firstorder"]).dt.days
    ## Create Column which gives the days between when the customer record was created and the first order
    df["created"] = pd.to_datetime(df["created"])
    df["created_first_days_diff"] = (df["created"] - df["firstorder"]).dt.days
    ## Drop Columns (returns a new frame instead of mutating in place)
    df = df.drop(columns=["custid", "created", "firstorder", "lastorder"])
    ## Apply one hot encoding on favday and city columns.
    ## dtype is pinned because pandas >= 2.0 defaults to bool indicators, which
    ## would be written to the header-less CSVs as True/False instead of 1/0.
    df = pd.get_dummies(
        df,
        prefix=["favday", "city"],
        columns=["favday", "city"],
        dtype="uint8",
    )
    return df

In [5]:
# Build the feature frame and drop the "retained" column so it is not written
# to the baseline CSV.
baseline_data = preprocess_data("data/storedata_total.csv").drop(columns=["retained"])
# A fixed random_state makes Restart & Run All pick the same baseline rows every time.
baseline_sample = baseline_data.sample(frac=0.0002, random_state=42)

In [6]:
# Write the sampled rows without header or index columns.
baseline_sample.to_csv("data/baseline.csv", header=False, index=False)

## Step 3: Generate Batch Dataset

In [7]:
# Build the feature frame and drop the "retained" column so it is not written
# to the batch CSV.
batch_data = preprocess_data("data/storedata_total.csv").drop(columns=["retained"])
# A fixed random_state makes Restart & Run All pick the same batch rows every time.
batch_sample = batch_data.sample(frac=0.2, random_state=42)

In [8]:
# Write the sampled rows without header or index columns.
batch_sample.to_csv("data/batch.csv", header=False, index=False)

## Step 4: Copy Data and Scripts to S3 Bucket

In [9]:
# Note: this is a boto3 *resource* (not a low-level client); the name is kept
# because later cells refer to it.
s3_client = boto3.resource('s3')
prefix = "dzd_673oui3ih26low/4s1wu2xctgm8pc/"

# (local file, object key below `prefix`) pairs, uploaded in this order.
# NOTE(review): the saved pipeline.definition() output further down shows the
# InputData/BatchData defaults without this prefix - confirm that the pipeline
# reads from the locations these files are uploaded to.
data_uploads = [
    ("data/storedata_total.csv", "data/storedata_total.csv"),
    ("data/batch.csv", "data/batch/batch.csv"),
    ("data/baseline.csv", "input/baseline/baseline.csv"),
]
for local_path, key_suffix in data_uploads:
    s3_client.Bucket(default_bucket).upload_file(local_path, prefix + key_suffix)

In [11]:
# Upload each pipeline script to <prefix>input/code/ under its own file name.
for script_name in ("preprocess.py", "evaluate.py", "generate_config.py"):
    s3_client.Bucket(default_bucket).upload_file(
        "pipelines/customerchurn/" + script_name,
        prefix + "input/code/" + script_name,
    )

## Step 5: Get the Pipeline Instance

In [12]:
from pipelines.customerchurn.pipeline import get_pipeline

# Gather the settings declared in Step 1 and hand them to the pipeline factory.
pipeline_settings = dict(
    region=region,
    role=role,
    default_bucket=default_bucket,
    model_package_group_name=model_package_group_name,
    pipeline_name=pipeline_name,
    custom_image_uri=clarify_image,
    sklearn_processor_version=sklearn_processor_version,
)
pipeline = get_pipeline(**pipeline_settings)

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.ProcessingJob.NetworkConfig.VpcConfig.Subnets
sagemaker.config INFO - Applied value from config key = SageMaker.ProcessingJob.NetworkConfig.VpcConfig.SecurityGroupIds
sagemaker.config INFO - Applied value from config key = SageMaker.TrainingJob.VpcConfig.Subnets
sagemaker.config INFO - Applied value from config key = SageMaker.TrainingJob.VpcConfig.SecurityGroupIds
sagemaker.config INFO - Applied value from config key = SageMaker.ProcessingJob.NetworkConfig.VpcConfig.Subnets
sagemaker.config INFO - Applied value from config key = SageMaker.ProcessingJob.NetworkConfig.VpcConfig.SecurityGroupIds
sagemaker.config INFO - Applied value from config key = SageMaker.Model.VpcConfig
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Ap

In [13]:
# Display the pipeline definition; the call returns it as a JSON string (see the output below).
pipeline.definition()

'{"Version": "2020-12-01", "Metadata": {}, "Parameters": [{"Name": "ProcessingInstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}, {"Name": "ProcessingInstanceCount", "Type": "Integer", "DefaultValue": 1}, {"Name": "TrainingInstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}, {"Name": "InputData", "Type": "String", "DefaultValue": "s3://amazon-sagemaker-196132265367-us-west-2-f2b2cbe646a5/data/storedata_total.csv"}, {"Name": "BatchData", "Type": "String", "DefaultValue": "s3://amazon-sagemaker-196132265367-us-west-2-f2b2cbe646a5/data/batch/batch.csv"}], "PipelineExperimentConfig": {"ExperimentName": {"Get": "Execution.PipelineName"}, "TrialName": {"Get": "Execution.PipelineExecutionId"}}, "Steps": [{"Name": "ChurnModelProcess", "Type": "Processing", "Arguments": {"ProcessingResources": {"ClusterConfig": {"InstanceType": {"Get": "Parameters.ProcessingInstanceType"}, "InstanceCount": {"Get": "Parameters.ProcessingInstanceCount"}, "VolumeSizeInGB": 30}}, "AppSpec

## Step 6: Submit the pipeline to SageMaker and start execution

In [14]:
# Create the pipeline in SageMaker, or update it if it already exists, using the execution role.
pipeline.upsert(role_arn=role)

{'PipelineArn': 'arn:aws:sagemaker:us-west-2:196132265367:pipeline/ChurnModelSMPipeline',
 'ResponseMetadata': {'RequestId': 'bd16fb1c-2683-40fe-9de5-5ff07f2c2fdf',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'bd16fb1c-2683-40fe-9de5-5ff07f2c2fdf',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '88',
   'date': 'Sat, 13 Sep 2025 18:33:04 GMT'},
  'RetryAttempts': 0}}

Start Pipeline Execution

In [15]:
# Start a run and keep the returned execution handle for the cells below.
execution = pipeline.start()

Now we describe the execution instance and list its steps to find out more about the execution.

In [16]:
# Show the execution's metadata (ARNs, status, timestamps, creator), as in the output below.
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-west-2:196132265367:pipeline/ChurnModelSMPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-west-2:196132265367:pipeline/ChurnModelSMPipeline/execution/bd0r500l2jtr',
 'PipelineExecutionDisplayName': 'execution-1757788384602',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2025, 9, 13, 18, 33, 4, 532000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2025, 9, 13, 18, 33, 4, 532000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-west-2:196132265367:user-profile/d-s0lkoulqjp2c/71bbd3f5-10c5-42ef-92d5-365e6054a3fa',
  'UserProfileName': '71bbd3f5-10c5-42ef-92d5-365e6054a3fa',
  'DomainId': 'd-s0lkoulqjp2c',
  'IamIdentity': {'Arn': 'arn:aws:sts::196132265367:assumed-role/datazone_usr_role_4s1wu2xctgm8pc_3u7qowvay08ahc/SageMaker',
   'PrincipalId': 'AROAS3KTE3WL55ZTNQZL2:SageMaker',
   'SourceIdentity': '71bbd3f5-10c5-42ef-92d5-365e6054a3fa'}},
 'LastModifiedBy': {'UserProfileArn

We can list the execution steps to check out the status and artifacts:

In [17]:
# NOTE: in the saved output this returned [] - presumably because the execution
# had only just been started (status 'Executing' above) and no step had been
# recorded yet. Re-run this cell later to see the steps.
execution.list_steps()

[]