# Predicting Customer Churn

In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

### Load the Data into Pandas

> **Note:** Since the features are all numerical, there is no need to perform any data encoding tasks.

In [2]:
# Load the CSV data into a DataFrame
file_path = Path("Resources/customer_churn.csv")
df = pd.read_csv(file_path)

# Display sample data
df.head()

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,107,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
1,137,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
2,84,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
3,75,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0
4,121,0,1,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,0


## Preprocess Data

### Create the features and rarget sets

The features set will be all the columns from the original DataFrame except the `Class` column that constitutes the target set.

In [3]:
# Creating the features set X
X = df.drop(columns=["churn"])

# Display sample data
X.head()

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls
0,107,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1
1,137,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0
2,84,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2
3,75,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3
4,121,0,1,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3


In [4]:
# Creating the target set y
y = df["churn"]

# Display sample data
y.head()

0    0
1    0
2    0
3    0
4    0
Name: churn, dtype: int64

### Split the features and target sets into training and testing datasets

In [5]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Use the Scikit-Learn’s `StandardScaler` to scale the features data

In [6]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Create a Machine Learning Model in SageMaker Studio

### Importing the required libraries

In [7]:
# Import Amazon SageMaker libraries and modules
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer, json_deserializer

# Import AWS Python SDK
import boto3

# Import support libraries
import io
import os
import json
import numpy as np

### Configure general settings for the SageMaker model

In [8]:
# Set the S3 bucket name
bucket = "fintech-bootcamp-activities-jams-2021-02-11"

In [9]:
# Set a prefix for the data files
prefix = "customer-churn"

In [10]:
# Set the IAM execution role
role = get_execution_role()

### Upload the training and testing data to Amazon S3

#### Encode and upload the training data

In [11]:
# Encode the training data as Protocol Buffer
buf = io.BytesIO()
vectors = np.array(X_train).astype("float32")
labels = np.array(y_train).astype("float32")
smac.write_numpy_to_dense_tensor(buf, vectors, labels)
buf.seek(0)

# Upload encoded training data to Amazon S3
key = 'linear_train.data'
boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "train", key)).upload_fileobj(buf)
s3_train_data = "s3://{}/{}/train/{}".format(bucket, prefix, key)
print("Training data uploaded to: {}".format(s3_train_data))

Training data uploaded to: s3://fintech-bootcamp-activities-jams-2021-02-11/customer-churn/train/linear_train.data


#### Encode and upload the testing data

In [12]:
# Encode the testing data as Protocol Buffer
buf = io.BytesIO()
vectors = np.array(X_test).astype("float32")
labels = np.array(y_test).astype("float32")
smac.write_numpy_to_dense_tensor(buf, vectors, labels)
buf.seek(0)

# Upload encoded testing data to Amazon S3
key = "linear_test.data"
boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "test", key)).upload_fileobj(buf)
s3_test_data = "s3://{}/{}/test/{}".format(bucket, prefix, key)
print("Testing data uploaded to: {}".format(s3_test_data))

Testing data uploaded to: s3://fintech-bootcamp-activities-jams-2021-02-11/customer-churn/test/linear_test.data


### Specify the Amazon SageMaker session to use

In [13]:
# Save the current session in a variable
sess = sagemaker.Session()

### Create an instance of the machine learning model

In [14]:
# Import the get_image_uri module from the sagamaker library
from sagemaker.amazon.amazon_estimator import get_image_uri

In [15]:
# Import the container image
container = get_image_uri(boto3.Session().region_name, "linear-learner")

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


In [16]:
# Create an instance of the machine learning model
linear = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type="ml.m4.xlarge",
    output_path="s3://{}/{}/output".format(bucket, prefix),
    sagemaker_session=sess,
)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


### Define Linear Learner hyperparameters

In [17]:
# Get the dimension of the feature-input vector
feature_dim = X.shape[1]

In [18]:
# Define linear learner hyperparameters
linear.set_hyperparameters(
    feature_dim=feature_dim,
    mini_batch_size=200,
    predictor_type="binary_classifier"
)

## Fit the Machine Learning Model in SageMaker Studio

Use the fit function of the model to train it using the train and testing data stored in the Amazon S3 bucket.

In [19]:
# Fitting the linear learner model
linear.fit({"train": s3_train_data, "test": s3_test_data})

2021-03-09 17:10:49 Starting - Starting the training job...
2021-03-09 17:11:12 Starting - Launching requested ML instancesProfilerReport-1615309849: InProgress
......
2021-03-09 17:12:13 Starting - Preparing the instances for training.........
2021-03-09 17:13:34 Downloading - Downloading input data...
2021-03-09 17:14:15 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[03/09/2021 17:14:29 INFO 140498712934208] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto',

## Make Predictions With the Model in SageMaker Studio

### Deploy the model

In [20]:
# Deploy an instance of the linear-learner model to create a predictor
linear_predictor = linear.deploy(initial_instance_count=1, instance_type="ml.t2.medium")

-----------------!

### Setting configurations for the predictor

In [21]:
# Linear predictor configurations
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer

### Make Predictions Using Testing Data

#### Use the `predict` function of the predictor to make predictions using the testing data stored in Pandas

In [22]:
# Making some predictions using the test data
model_predictions = linear_predictor.predict(X_test_scaled)

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [23]:
# Display sample predictions
model_predictions["predictions"][:3]

[{'score': 2.2269659893936478e-05, 'predicted_label': 0},
 {'score': 0.00014382405788637698, 'predicted_label': 0},
 {'score': 3.6878966056974605e-05, 'predicted_label': 0}]

#### Creating a list of the predicted values

In [24]:
# Create a list with the predicted values
y_predictions = [np.uint8(value["predicted_label"]) for value in model_predictions["predictions"]]

# Transforming the list into an array
y_predictions = np.array(y_predictions)

# Display sample data
y_predictions[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)

## Evaluate the Machine Model

Use the `classification_report` module from Scikit-learn to assess the performance of the model to predict fraudulent credit card transactions.

In [25]:
# Import the classification report from Scikit-learn
from sklearn.metrics import classification_report

In [26]:
# Display classification report
print("Classification report")
print(classification_report(y_test, y_predictions))

Classification report
              precision    recall  f1-score   support

           0       0.86      1.00      0.92       909
           1       0.00      0.00      0.00       154

    accuracy                           0.86      1063
   macro avg       0.43      0.50      0.46      1063
weighted avg       0.73      0.86      0.79      1063



  _warn_prf(average, modifier, msg_start, len(result))


## Delete the End-Point to Avoid Additional AWS Resources Usage and Billing

Make sure that you delete all the Amazon SageMaker endpoints to prevent unwanted charges.

In [27]:
# Delete Amazon SageMaker end-point
sagemaker.Session().delete_endpoint(linear_predictor.endpoint)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
