In [339]:
import os
import boto3
import re
import sagemaker

In [340]:
role=sagemaker.get_execution_role()
region = boto3.Session().region_name
bucket = 'customers-create-bucket'
prefix = (
    "churn-prediction"  # place to upload training files within the bucket
)

In [341]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import io
import time
import json
import sagemaker.amazon.common as smac

In [342]:
s3 = boto3.client("s3")
filename = 'Telco-Customer-Churn.csv'
s3.download_file(Bucket="customers-create-bucket", Key="churn-prediction/Telco-Customer-Churn.csv", Filename=filename)
data = pd.read_csv(filename)
data.shape

(7043, 21)

In [348]:
data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [344]:
data.loc[data.Churn=='No','Churn']=0
data.loc[data.Churn=='Yes','Churn']=1

In [345]:
cols = ['OnlineBackup', 'StreamingMovies','DeviceProtection',
                'TechSupport','OnlineSecurity','StreamingTV']
for i in cols : 
    data[i]  = data[i].replace({'No internet service' : 'No'})

In [346]:
data['TotalCharges'] = data["TotalCharges"].replace(" ", np.nan)
data = data[data["TotalCharges"].notnull()]
data = data.reset_index()[data.columns]
data["TotalCharges"] = data["TotalCharges"].astype(float)

In [347]:
data["Churn"].value_counts().values

array([5163, 1869])

In [333]:
display(data.head())
display(data.describe())

Unnamed: 0,customerID,tenure,MonthlyCharges,TotalCharges,Churn,Contract_One year,Contract_Two year,Dependents_Yes,DeviceProtection_Yes,gender_Male,...,PaperlessBilling_Yes,Partner_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,PhoneService_Yes,SeniorCitizen_1,StreamingMovies_Yes,StreamingTV_Yes,TechSupport_Yes
0,7590-VHVEG,1,29.85,29.85,0,False,False,False,False,False,...,True,True,False,True,False,False,False,False,False,False
1,5575-GNVDE,34,56.95,1889.50,0,True,False,False,True,True,...,False,False,False,False,True,True,False,False,False,False
2,3668-QPYBK,2,53.85,108.15,1,False,False,False,False,True,...,True,False,False,False,True,True,False,False,False,False
3,7795-CFOCW,45,42.30,1840.75,0,True,False,False,True,True,...,False,False,False,False,False,False,False,False,False,True
4,9237-HQITU,2,70.70,151.65,1,False,False,False,False,False,...,True,False,False,True,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,6840-RESVB,24,84.80,1990.50,0,True,False,True,True,True,...,True,True,False,False,True,True,False,True,True,True
7028,2234-XADUH,72,103.20,7362.90,0,True,False,True,True,False,...,True,True,True,False,False,True,False,True,True,False
7029,4801-JZAZL,11,29.60,346.45,0,False,False,True,False,False,...,True,True,False,True,False,False,False,False,False,False
7030,8361-LTMKD,4,74.40,306.60,1,False,False,False,False,True,...,True,True,False,False,True,True,True,False,False,False


Unnamed: 0,tenure,MonthlyCharges,TotalCharges
count,7032.0,7032.0,7032.0
mean,32.421786,64.798208,2283.300441
std,24.54526,30.085974,2266.771362
min,1.0,18.25,18.8
25%,9.0,35.5875,401.45
50%,29.0,70.35,1397.475
75%,55.0,89.8625,3794.7375
max,72.0,118.75,8684.8


In [351]:
data.nunique()

customerID          7032
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                72
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         2
OnlineBackup           2
DeviceProtection       2
TechSupport            2
StreamingTV            2
StreamingMovies        2
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1584
TotalCharges        6530
Churn                  2
dtype: int64

In [352]:
target_instance = data["Churn"].value_counts().to_frame()
target_instance = target_instance.reset_index()
fig = px.pie(target_instance, values='count', names='Churn', color_discrete_sequence=["green", "red"],
             title='Distribution of Churn')
fig.show()

In [279]:
data = pd.get_dummies(data, columns = ['Contract','Dependents','DeviceProtection','gender',
                                                        'InternetService','MultipleLines','OnlineBackup',
                                                        'OnlineSecurity','PaperlessBilling','Partner',
                                                        'PaymentMethod','PhoneService','SeniorCitizen',
                                                        'StreamingMovies','StreamingTV','TechSupport'],
                              drop_first=True)
data

Unnamed: 0,customerID,tenure,MonthlyCharges,TotalCharges,Churn,Contract_One year,Contract_Two year,Dependents_Yes,DeviceProtection_Yes,gender_Male,...,PaperlessBilling_Yes,Partner_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,PhoneService_Yes,SeniorCitizen_1,StreamingMovies_Yes,StreamingTV_Yes,TechSupport_Yes
0,7590-VHVEG,1,29.85,29.85,0,False,False,False,False,False,...,True,True,False,True,False,False,False,False,False,False
1,5575-GNVDE,34,56.95,1889.50,0,True,False,False,True,True,...,False,False,False,False,True,True,False,False,False,False
2,3668-QPYBK,2,53.85,108.15,1,False,False,False,False,True,...,True,False,False,False,True,True,False,False,False,False
3,7795-CFOCW,45,42.30,1840.75,0,True,False,False,True,True,...,False,False,False,False,False,False,False,False,False,True
4,9237-HQITU,2,70.70,151.65,1,False,False,False,False,False,...,True,False,False,True,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,6840-RESVB,24,84.80,1990.50,0,True,False,True,True,True,...,True,True,False,False,True,True,False,True,True,True
7028,2234-XADUH,72,103.20,7362.90,0,True,False,True,True,False,...,True,True,True,False,False,True,False,True,True,False
7029,4801-JZAZL,11,29.60,346.45,0,False,False,True,False,False,...,True,True,False,True,False,False,False,False,False,False
7030,8361-LTMKD,4,74.40,306.60,1,False,False,False,False,True,...,True,True,False,False,True,True,True,False,False,False


In [280]:
rand_split = np.random.rand(len(data))

train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) & (rand_split < 0.9)
test_list = rand_split >= 0.9

data_train = data[train_list]
data_val = data[val_list]
data_test = data[test_list]

train_y = data_train['Churn'].to_numpy()
train_x = data_train.drop(['Churn','customerID'], axis=1).to_numpy()

val_y = data_val['Churn'].to_numpy()
val_x = data_val.drop(['Churn','customerID'], axis=1).to_numpy()

test_y = data_test['Churn'].to_numpy()
test_x = data_test.drop(['Churn','customerID'], axis=1).to_numpy()

In [281]:
train_file = "liner_train.data"

f = io.BytesIO()

smac.write_numpy_to_dense_tensor(f, train_x.astype("float32"), train_y.astype("float32"))
f.seek(0)

boto3.Session().resource("s3").Bucket(bucket).Object(
    os.path.join(prefix, "train", train_file)
).upload_fileobj(f)

In [282]:
validation_file = "linear_validation.data"

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, val_x.astype("float32"), val_y.astype("float32"))
f.seek(0)

boto3.Session().resource("s3").Bucket(bucket).Object(
    os.path.join(prefix, "validation", validation_file)
).upload_fileobj(f)

In [287]:
from sagemaker import image_uris
container = image_uris.retrieve(region=boto3.Session().region_name, framework="linear-learner")

In [288]:
linear_job = "Demo-Customer-Churn-Prediction-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
print("Job name is:", linear_job)

linear_training_params = {
    "RoleArn": role,
    "TrainingJobName": linear_job,
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.c4.2xlarge", "VolumeSizeInGB": 10},
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/train/".format(bucket, prefix),
                    "S3DataDistributionType": "ShardedByS3Key",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/validation/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
    ],
    "OutputDataConfig": {"S3OutputPath": "s3://{}/{}/".format(bucket, prefix)},
    "HyperParameters": {
        "feature_dim": "24",
        "mini_batch_size": "100",
        "predictor_type": "regressor",
        "epochs": "10",
        "num_models": "32",
        "loss": "absolute_loss",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 60 * 60},
}

Job name is: Demo-Customer-Churn-Prediction-2023-07-17-18-05-27


In [289]:
%%time

region = boto3.Session().region_name
sm = boto3.client("sagemaker")

sm.create_training_job(**linear_training_params)

status = sm.describe_training_job(TrainingJobName=linear_job)["TrainingJobStatus"]
print(status)
sm.get_waiter("training_job_completed_or_stopped").wait(TrainingJobName=linear_job)
if status == "Failed":
    message = sm.describe_training_job(TrainingJobName=linear_job)["FailureReason"]
    print("Training failed with the following error: {}".format(message))
    raise Exception("Training job failed")

InProgress
CPU times: user 103 ms, sys: 125 µs, total: 103 ms
Wall time: 4min


In [291]:
linear_hosting_container = {
    "Image": container,
    "ModelDataUrl": sm.describe_training_job(TrainingJobName=linear_job)["ModelArtifacts"][
        "S3ModelArtifacts"
    ],
}

create_model_response = sm.create_model(
    ModelName=linear_job, ExecutionRoleArn=role, PrimaryContainer=linear_hosting_container
)

print(create_model_response["ModelArn"])

arn:aws:sagemaker:ap-southeast-2:854272207879:model/demo-customer-churn-prediction-2023-07-17-18-05-27


In [298]:
linear_endpoint_config = "DEMO-Churn-Prediction-endpoint-config-" + time.strftime(
    "%Y-%m-%d-%H-%M-%S", time.gmtime()
)
print(linear_endpoint_config)
create_endpoint_config_response = sm.create_endpoint_config(
EndpointConfigName=linear_endpoint_config,
    ProductionVariants=[
        {
            "InstanceType": "ml.m4.xlarge",
            "InitialInstanceCount": 1,
            "ModelName": linear_job,
            "VariantName": "AllTraffic",
        }
    ],
)
print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])

DEMO-Churn-Prediction-endpoint-config-2023-07-17-18-22-54
Endpoint Config Arn: arn:aws:sagemaker:ap-southeast-2:854272207879:endpoint-config/demo-churn-prediction-endpoint-config-2023-07-17-18-22-54


In [299]:
%%time

linear_endpoint = "DEMO-Churn-Prediction-endpoint-" + time.strftime("%Y%m%d%H%M", time.gmtime())
print(linear_endpoint)
create_endpoint_response = sm.create_endpoint(
    EndpointName=linear_endpoint, EndpointConfigName=linear_endpoint_config
)
print(create_endpoint_response["EndpointArn"])

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp["EndpointStatus"]
print("Status: " + status)

sm.get_waiter("endpoint_in_service").wait(EndpointName=linear_endpoint)

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp["EndpointStatus"]
print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

if status != "InService":
    raise Exception("Endpoint creation did not succeed")

DEMO-Churn-Prediction-endpoint-202307171831
arn:aws:sagemaker:ap-southeast-2:854272207879:endpoint/demo-churn-prediction-endpoint-202307171831
Status: Creating
Arn: arn:aws:sagemaker:ap-southeast-2:854272207879:endpoint/demo-churn-prediction-endpoint-202307171831
Status: InService
CPU times: user 91.3 ms, sys: 2.58 ms, total: 93.8 ms
Wall time: 4min 1s


In [301]:
def np2csv(arr):
    csv = io.BytesIO()
    np.savetxt(csv, arr, delimiter=",", fmt="%g")
    return csv.getvalue().decode().rstrip()

In [314]:
runtime = boto3.client("runtime.sagemaker")

payload = np2csv(test_x)
response = runtime.invoke_endpoint(
    EndpointName=linear_endpoint, ContentType="text/csv", Body=payload
)
result = json.loads(response["Body"].read().decode())
test_pred = np.array([r["score"] for r in result["predictions"]])

In [315]:
test_mae_linear = np.mean(np.abs(test_y - test_pred))
test_mae_baseline = np.mean(
    np.abs(test_y - np.median(train_y))
)  ## training median as baseline predictor

print("Test MAE Baseline :", round(test_mae_baseline, 3))
print("Test MAE Linear:", round(test_mae_linear, 3))

Test MAE Baseline : 0.271
Test MAE Linear: 0.272


In [316]:
test_pred_class = (test_pred > 0.5) + 0
test_pred_baseline = np.repeat(np.median(train_y), len(test_y))

prediction_accuracy = np.mean((test_y == test_pred_class)) * 100
baseline_accuracy = np.mean((test_y == test_pred_baseline)) * 100

print("Prediction Accuracy:", round(prediction_accuracy, 1), "%")
print("Baseline Accuracy:", round(baseline_accuracy, 1), "%")

Prediction Accuracy: 72.9 %
Baseline Accuracy: 72.9 %


In [325]:
#sm.delete_endpoint(EndpointName=linear_endpoint)