In [1]:
%%html
<style>
table {float:left}
</style>

In [None]:
!conda install -c conda-forge shap --yes
!pip install smdebug --upgrade

In [5]:
import re
# import s3fs
import shap
import time
import boto3
import pandas as pd
import numpy as np

from itertools import islice
import matplotlib.pyplot as plt

import sagemaker
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from sagemaker.debugger import DebuggerHookConfig,CollectionConfig
from sagemaker.debugger import rule_configs, Rule
from smdebug.trials import create_trial
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
[2024-11-18 23:23:03.946 default:2263 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None


In [6]:
#Replace this value with the S3 Bucket Created

default_bucket = "sagemaker-us-east-1-418295711785" #"customer-churn-sm-pipeline"

In [7]:
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = sagemaker_session.boto_region_name
# Retrieve the default S3 bucket
default_bucket = sagemaker_session.default_bucket()

In [8]:
## Preprocess the dataset
def preprocess_data(file_path):
    df = pd.read_csv(file_path)
    ## Convert to datetime columns
    df["firstorder"]=pd.to_datetime(df["firstorder"],errors='coerce')
    df["lastorder"] = pd.to_datetime(df["lastorder"],errors='coerce')
    ## Drop Rows with null values
    df = df.dropna()
    ## Create Column which gives the days between the last order and the first order
    df["first_last_days_diff"] = (df['lastorder']-df['firstorder']).dt.days
    ## Create Column which gives the days between when the customer record was created and the first order
    df['created'] = pd.to_datetime(df['created'])
    df['created_first_days_diff']=(df['created']-df['firstorder']).dt.days
    ## Drop Columns
    df.drop(['custid','created','firstorder','lastorder'],axis=1,inplace=True)
    ## Apply one hot encoding on favday and city columns
    df = pd.get_dummies(df,prefix=['favday','city'],columns=['favday','city'])
    return df

In [5]:
## Set the required configurations
model_name = "churn_model"
env = "dev"
## S3 Bucket
default_bucket = "customer-churn-sm-pipeline"
## Preprocess the dataset
storedata = preprocess_data("data/storedata_total.csv")

In [6]:
storedata.head()

Unnamed: 0,retained,esent,eopenrate,eclickrate,avgorder,ordfreq,paperless,refill,doorstep,first_last_days_diff,...,favday_Monday,favday_Saturday,favday_Sunday,favday_Thursday,favday_Tuesday,favday_Wednesday,city_BLR,city_BOM,city_DEL,city_MAA
0,0,29,100.0,3.448276,14.52,0.0,0,0,0,0,...,True,False,False,False,False,False,False,False,True,False
1,1,95,92.631579,10.526316,83.69,0.181641,1,1,1,1024,...,False,False,False,False,False,False,False,False,True,False
2,0,0,0.0,0.0,33.58,0.059908,0,0,0,217,...,False,False,False,False,False,True,False,False,True,False
3,0,0,0.0,0.0,54.96,0.0,0,0,0,0,...,False,False,False,True,False,False,False,True,False,False
4,1,30,90.0,13.333333,111.91,0.00885,0,0,0,791,...,True,False,False,False,False,False,False,True,False,False


In [8]:
def split_datasets(df):
    y=df.pop("retained")
    X_pre = df
    y_pre = y.to_numpy().reshape(len(y),1)
    feature_names = list(X_pre.columns)
    X= np.concatenate((y_pre,X_pre),axis=1)
    np.random.shuffle(X)
    train,validation,test=np.split(X,[int(.7*len(X)),int(.85*len(X))])
    return feature_names,train,validation,test

In [9]:
# Split dataset
feature_names,train,validation,test = split_datasets(storedata)

In [11]:
# Save datasets in Amazon S3
pd.DataFrame(train).to_csv("data/train/train.csv",header=False,index=False)
pd.DataFrame(validation).to_csv("data/validation/validation.csv",header=False,index=False)
pd.DataFrame(test).to_csv("data/test/test.csv",header=False,index=False)

In [18]:
# Training and Validation Input for SageMaker Training job
s3_input_train = TrainingInput(
    s3_data="data/train/",content_type="csv")
s3_input_validation = TrainingInput(
    s3_data="data/validation/",content_type="csv")

In [16]:
# Hyperparameter used
fixed_hyperparameters = {
    "eval_metric":"auc",
    "objective":"binary:logistic",
    "num_round":"100",
    "rate_drop":"0.3",
    "tweedie_variance_power":"1.4"
}

In [26]:
# Use the built-in SageMaker algorithm

sess = sagemaker.Session()
role = sagemaker.get_execution_role()
region = sess.boto_region_name
container = sagemaker.image_uris.retrieve("xgboost",region,"0.90-2")

estimator = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    hyperparameters=fixed_hyperparameters,
    instance_type="ml.m4.xlarge",
    output_path="s3://{}/output".format(default_bucket),
    sagemaker_session=sess
)

hyperparameter_ranges = {
    "eta": ContinuousParameter(0, 1),
    "min_child_weight": ContinuousParameter(1, 10),
    "alpha": ContinuousParameter(0, 2),
    "max_depth": IntegerParameter(1, 10),
}
objective_metric_name = "validation:auc"
tuner = HyperparameterTuner(
estimator, objective_metric_name,
hyperparameter_ranges,max_jobs=10,max_parallel_jobs=2)

In [27]:
# Tune
tuner.fit({
    "train":s3_input_train,
    "validation":s3_input_validation
    },include_cls_metadata=False)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


ClientError: An error occurred (ValidationException) when calling the CreateHyperParameterTuningJob operation: 2 validation errors detected: Value 'data/train/' at 'trainingJobDefinition.inputDataConfig.1.member.dataSource.s3DataSource.s3Uri' failed to satisfy constraint: Member must satisfy regular expression pattern: ^(https|s3)://([^/]+)/?(.*)$; Value 'data/validation/' at 'trainingJobDefinition.inputDataConfig.2.member.dataSource.s3DataSource.s3Uri' failed to satisfy constraint: Member must satisfy regular expression pattern: ^(https|s3)://([^/]+)/?(.*)$