# Predicting whether to contact a customer because they are at risk of churning

For updates on how SageMaker or AWS behavior differs from the notebook code, please refer to https://livebook.manning.com/#!/book/machine-learning-for-business/chapter-3/v-5/119

## Part 1: Load and examine the data

In [8]:
# Notebook-wide settings used by the cells below.
data_bucket = '' # placeholder only; the setup cell replaces it with the SageMaker default S3 bucket
subfolder = 'ch03'  # S3 key prefix under which processed data and model output are stored
dataset = 'churn_data.csv'  # CSV file read from the notebook's working directory

In [None]:
import pandas as pd
from time import sleep

import boto3
import sagemaker
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

# Create the SageMaker session first; it supplies the default S3 bucket
# that replaces the empty `data_bucket` placeholder.
sess = sagemaker.Session()
data_bucket = sess.default_bucket()

# Execution role that is later handed to the Estimator.
role = sagemaker.get_execution_role()

print(f'SageMaker exectuion role: {role},  default s3 bucket: {data_bucket}')

In [31]:
# Load the raw churn data from the notebook's working directory and preview it.
local_csv_path = f'./{dataset}'
df = pd.read_csv(local_csv_path)
df.head()

Unnamed: 0,churned,id,customer_code,co_name,total_spend,week_minus_4,week_minus_3,week_minus_2,last_week,4-3_delta,3-2_delta,2-1_delta
0,0,1,1826,Hoffman Martinez and Chandler,68567.34,0.81,0.02,0.74,1.45,-0.79,0.72,0.71
1,0,2,772,Lee Martin and Escobar,74335.27,1.87,1.02,1.29,1.19,-0.85,0.27,-0.1
2,0,3,479,Hobbs Mcdaniel and Baker,48746.22,1.21,0.7,1.04,2.12,-0.51,0.34,1.08
3,0,4,1692,Williams-Harris,64416.7,0.75,2.08,2.4,2.02,1.33,0.32,-0.38
4,0,5,2578,Beck-Snyder,71623.2,2.33,0.66,1.97,1.6,-1.67,1.31,-0.37


In [11]:
# Report the dataset size, then how many rows fall into each `churned` class.
row_count = len(df)
print(f'Number of rows in dataset: {row_count}')

churn_counts = df['churned'].value_counts()
print(churn_counts)

Number of rows in dataset: 2999
0    2833
1     166
Name: churned, dtype: int64


## Part 2: Get the data into the right shape

In [32]:
# Keep a list of the original column names.
columns = df.columns.tolist()

# Remove the id, customer code and company name columns; the remaining
# frame (label first, then numeric features) is what gets split for training.
dropped_columns = ['id', 'customer_code', 'co_name']
encoded_data = df.drop(columns=dropped_columns)
encoded_data.head()

Unnamed: 0,churned,total_spend,week_minus_4,week_minus_3,week_minus_2,last_week,4-3_delta,3-2_delta,2-1_delta
0,0,68567.34,0.81,0.02,0.74,1.45,-0.79,0.72,0.71
1,0,74335.27,1.87,1.02,1.29,1.19,-0.85,0.27,-0.1
2,0,48746.22,1.21,0.7,1.04,2.12,-0.51,0.34,1.08
3,0,64416.7,0.75,2.08,2.4,2.02,1.33,0.32,-0.38
4,0,71623.2,2.33,0.66,1.97,1.6,-1.67,1.31,-0.37


## Part 3: Create training, validation and test data sets

In [33]:
# First split: hold out 30% of the rows, stratified on the `churned` label so
# the class balance is preserved in both parts.
y = encoded_data['churned']
train_df, test_and_val_data, _, _ = train_test_split(encoded_data, y, test_size=0.3, stratify=y, random_state=0)

# Second split: divide the held-out rows into validation and test sets
# (test_size=0.333 sends roughly one third of them to the test set).
y = test_and_val_data['churned']
val_df, test_df, _, _ = train_test_split(test_and_val_data, y, test_size=0.333, stratify=y, random_state=0)

print(train_df.shape, val_df.shape, test_df.shape)

# Per-split row count and class balance. These messages must be f-strings,
# otherwise the placeholder text is printed literally instead of the count.
for split_name, split_df in (('Train', train_df), ('Validate', val_df), ('Test', test_df)):
    print()
    print(f'Number of rows in {split_name} dataset: {split_df.shape[0]}')
    print(split_df['churned'].value_counts())

(2099, 9) (600, 9) (300, 9)

Number of rows in Train dataset: {train_df.shape[0]}
0    1983
1     116
Name: churned, dtype: int64

Number of rows in Validate dataset: {val_df.shape[0]}
0    567
1     33
Name: churned, dtype: int64

Number of rows in Test dataset: {test_df.shape[0]}
0    283
1     17
Name: churned, dtype: int64


In [15]:
# All processed splits are stored under one S3 prefix.
processed_prefix = f's3://{data_bucket}/{subfolder}/processed'
train_uri = f'{processed_prefix}/train.csv'
val_uri = f'{processed_prefix}/val.csv'
test_uri = f'{processed_prefix}/test.csv'

# Train and validation files are written without header or index; the test
# file keeps its header because it is read back with pandas later on.
# NOTE: DataFrame.to_csv returns None when given a path, so these three
# names end up bound to None rather than to any data.
train_data = train_df.to_csv(train_uri, header=False, index=False)
val_data = val_df.to_csv(val_uri, header=False, index=False)
test_data = test_df.to_csv(test_uri, header=True, index=False)

# Wrap the train and validation files in TrainingInput objects for estimator.fit().
train_input = sagemaker.inputs.TrainingInput(s3_data=train_uri, content_type='csv')
val_input = sagemaker.inputs.TrainingInput(s3_data=val_uri, content_type='csv')

## Part 4: Train the model

In [20]:
sess = sagemaker.Session()

# Look up the XGBoost training image for the region of the current boto3 session.
region = boto3.Session().region_name
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=region,
    version='latest',
)

# One ml.m5.xlarge training instance; model artifacts go under the chapter's
# output prefix in the data bucket.
model_output_uri = f's3://{data_bucket}/{subfolder}/output'
estimator = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=model_output_uri,
    sagemaker_session=sess,
)

# Binary classifier scored by AUC on the validation channel.
# scale_pos_weight=17 is roughly the negative:positive ratio seen in the
# training split (1983 non-churned vs 116 churned rows).
hyperparameters = {
    'max_depth': 3,
    'subsample': 0.7,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'num_round': 100,
    'early_stopping_rounds': 10,
    'scale_pos_weight': 17,
}
estimator.set_hyperparameters(**hyperparameters)

channels = {'train': train_input, 'validation': val_input}
estimator.fit(channels)

2022-02-08 03:18:37 Starting - Starting the training job...
2022-02-08 03:19:02 Starting - Launching requested ML instancesProfilerReport-1644290317: InProgress
......
2022-02-08 03:20:02 Starting - Preparing the instances for training.........
2022-02-08 03:21:35 Downloading - Downloading input data
2022-02-08 03:21:35 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2022-02-08:03:21:36:INFO] Running standalone xgboost training.[0m
[34m[2022-02-08:03:21:36:INFO] File size need to be processed in the node: 0.12mb. Available memory size in the node: 8023.84mb[0m
[34m[2022-02-08:03:21:36:INFO] Determined delimiter of CSV input is ','[0m
[34m[03:21:36] S3DistributionType set as FullyReplicated[0m
[34m[03:21:36] 2099x8 matrix with 16792 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-02-08:03:21:36:INFO] Determined delimiter of CSV input is ','[0m
[34m[03:21:36] S3DistributionType

## Part 5: Host the model

In [34]:
endpoint_name = 'customer-churn'

# Best-effort cleanup: if an endpoint with this name is already running,
# delete it so the deploy step below can reuse the name.
try:
    predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name)
    # Used as an existence probe; presumably raises when the endpoint is
    # missing, which sends control to the handler below.
    predictor.endpoint_context()
    sess.delete_endpoint(endpoint_name)
    print('Warning: Existing endpoint deleted to make way for your new endpoint.')
    print('Waiting 10 seconds ...')
    sleep(10)
except Exception:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt/SystemExit (e.g. interrupting the sleep) still propagate.
    # Any failure in the steps above is reported as "endpoint not found".
    print(f'endpoint: {endpoint_name} not exits')

endpoint: customer-churn not exits


In [24]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer

# Host the trained model on a single ml.m5.xlarge instance under `endpoint_name`.
# Requests are serialized as CSV and responses are parsed as JSON.
request_serializer = CSVSerializer()
response_deserializer = JSONDeserializer()

predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    endpoint_name=endpoint_name,
    serializer=request_serializer,
    deserializer=response_deserializer,
)


-----!

## Part 6: Test the model

In [26]:
def get_prediction(row, threshold=0.5):
    """Return the predicted churn class (1 or 0) for one row of test data.

    Args:
        row: One row of the test DataFrame. Its first element (the
            `churned` label) is skipped so that only the remaining values
            are sent to the endpoint.
        threshold: Cut-off applied to the endpoint's score; scores strictly
            above it map to 1. Defaults to 0.5, the original fixed cut-off.

    Returns:
        1 if the score returned by the endpoint exceeds `threshold`, else 0.
    """
    features = row[1:]
    # The endpoint's response is converted to a float score.
    prob = float(predictor.predict(features))
    return 1 if prob > threshold else 0


# Read the held-out test split back from S3 (it was written with a header row).
test_csv_uri = f's3://{data_bucket}/{subfolder}/processed/test.csv'
test_data = pd.read_csv(test_csv_uri)

# Score every row through the endpoint and store the predicted class.
test_data['prediction'] = test_data.apply(get_prediction, axis=1)
test_data.head(10)

Unnamed: 0,churned,total_spend,week_minus_4,week_minus_3,week_minus_2,last_week,4-3_delta,3-2_delta,2-1_delta,prediction
0,0,76897.46,0.56,2.29,1.14,2.23,1.73,-1.15,1.09,0
1,0,19604.63,1.95,2.04,0.82,1.62,0.09,-1.22,0.8,0
2,0,23369.6,1.11,1.54,1.55,1.14,0.43,0.01,-0.41,0
3,1,40709.47,2.4,1.87,0.07,0.61,-0.53,-1.8,0.54,1
4,0,69953.52,2.01,1.2,1.05,1.41,-0.81,-0.15,0.36,0
5,0,71939.07,0.54,1.17,0.21,2.29,0.63,-0.96,2.08,0
6,0,45930.53,0.08,1.43,0.41,1.34,1.35,-1.02,0.93,0
7,0,47080.25,1.54,0.68,0.8,0.54,-0.86,0.12,-0.26,0
8,0,35506.83,1.37,0.93,1.7,0.67,-0.44,0.77,-1.03,0
9,0,39188.12,0.4,1.86,0.1,0.82,1.46,-1.76,0.72,0


In [27]:
# Compare the distribution of actual labels with the predicted ones,
# then print the overall accuracy.
actual = test_data['churned']
predicted = test_data['prediction']

print(actual.value_counts())
print(predicted.value_counts())
print(metrics.accuracy_score(actual, predicted))

0    283
1     17
Name: churned, dtype: int64
0    267
1     33
Name: prediction, dtype: int64
0.9466666666666667


In [28]:
# Confusion matrix of actual (`churned`) versus predicted labels on the test set.
test_confusion = metrics.confusion_matrix(test_data['churned'], test_data['prediction'])
print(test_confusion)

[[267  16]
 [  0  17]]


In [29]:
# Toy three-class example (labels 0, 1, 2) showing how to read a confusion matrix.
# NOTE: this rebinds the notebook-level name `y` used earlier for the split labels.
y = [1,0,0,0,0,0,0,0,0,2]
pred = [0,0,0,0,0,0,0,0,1,2]
# First argument is the true labels, second the predicted labels.
print(metrics.confusion_matrix(y,pred))

[[7 1 0]
 [1 0 0]
 [0 0 1]]


## Remove the Endpoint (optional)
Running this cell deletes the endpoint. Comment it out if you want the endpoint to keep running after "Run All".

In [30]:
# Delete the hosted endpoint named by `endpoint_name` via the SageMaker session.
sess.delete_endpoint(endpoint_name)