# Churn automl 
with batch transform

todos - 
* align the user ids and figure if column headers are allowed in the test data or if we have to add them later
* also whether y column can/can't be in test data. 
* add clarify job
* clean the directories so all in the same project, and in sub directories inputs/training/test, inference/output.


In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import boto3
import sagemaker  
from sagemaker.automl.automl import AutoML
from time import gmtime, strftime, sleep
from sagemaker import get_execution_role




In [2]:
sm = boto3.client('sagemaker')
session = sagemaker.Session()
bucket = session.default_bucket()
role = get_execution_role()


In [3]:
auto_ml_job_name = 'automl-churn' # This will be the s3 bucket name so dashes are ok but underscores are not 
prefix = 'sagemaker/' + auto_ml_job_name
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())
base_job_name = 'automl-card-churn' #redundant 
target_attribute_name = 'churn_flag'
model_name = 'automl-churn-model-' + timestamp_suffix
transform_job_name = "automl-churn-model-transform"

test_data_s3_path = f's3://{bucket}/{prefix}/test/'
s3_transform_output_path = f's3://{bucket}/{prefix}/inference-results/'



# Step 1 - Get datasets 
Not a sagemaker task - this should ordinarily be done in a separate preprocessing step to e.g. pull data from the Lake or Database.
The test set in this case is used to demonstrate how to score

In [4]:
import os

In [5]:
os.chdir('/root/churn')

In [15]:
#!apt install unzip
#!wget http://dataminingconsultant.com/DKD2e_data_sets.zip
#!unzip -o DKD2e_data_sets.zip


In [16]:
churn = pd.read_csv('./raw/churn.txt')

In [17]:
churn.shape

(3333, 21)

In [18]:
#There is no explict customer id- so lets use tel number as a proxy
churn.rename(columns={'Phone': 'cust_id'}, inplace = True)

In [19]:
# the target column is not well formed - lets turn it to an indicator flag instead

churn['churn_flag'] = np.where(churn['Churn?']=='False.', 0, 1)
churn.drop('Churn?', axis='columns', inplace=True)
churn['churn_flag'].value_counts()


0    2850
1     483
Name: churn_flag, dtype: int64

In [20]:
churn.head()

Unnamed: 0,State,Account Length,Area Code,cust_id,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,churn_flag
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0


In [21]:
y = churn['churn_flag']
X = churn.drop('churn_flag', axis='columns')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [22]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2233, 20), (1100, 20), (2233,), (1100,))

In [6]:
train_file = 'train/train_data.csv';
test_file = 'test/test_data.csv';

In [None]:
#Couple of changes requred - drop the customer id column from the training dataset,
and for the test datatet, put it as the first or last column

In [24]:
training_data = pd.DataFrame(X_train)
training_data['churn_flag'] = list(y_train)
test_data = pd.DataFrame(X_test)

training_data.to_csv(train_file, index=False, header=True)
train_data_s3_path = session.upload_data(path=train_file, key_prefix=prefix + "/train")
print('Train data uploaded to: ' + train_data_s3_path)


test_data.to_csv(test_file, index=False, header=True)
test_data_s3_path = session.upload_data(path=test_file, key_prefix=prefix + "/test")
print('Test data uploaded to: ' + test_data_s3_path)


Train data uploaded to: s3://sagemaker-eu-west-1-114936231890/sagemaker/automl-churn/train/train_data.csv
Test data uploaded to: s3://sagemaker-eu-west-1-114936231890/sagemaker/automl-churn/test/test_data.csv


# Step 2 - Training Job

In [8]:
# input_data_config = [{
#       'DataSource': {
#         'S3DataSource': {
#           'S3DataType': 'S3Prefix',
#           'S3Uri': 's3://{}/{}/train'.format(bucket,prefix)
#         }
#       },
#       'TargetAttributeName': 'Class'
#     }
#   ]
automl = AutoML(role=role,
                target_attribute_name=target_attribute_name,
                base_job_name=auto_ml_job_name,
                sagemaker_session=session,
                problem_type='BinaryClassification',
                job_objective={'MetricName': 'AUC'},
                max_candidates=10)                

In [None]:
automl.fit(train_file, job_name=auto_ml_job_name, wait=False, logs=False)
describe_response = automl.describe_auto_ml_job()
print (describe_response)
job_run_status = describe_response['AutoMLJobStatus']
    
while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    describe_response = automl.describe_auto_ml_job()
    job_run_status = describe_response['AutoMLJobStatus']
    print (job_run_status)
    sleep(30)
print ('completed')

In [None]:
best_candidate = automl.describe_auto_ml_job()['BestCandidate']
best_candidate_name = best_candidate['CandidateName']
print("CandidateName: " + best_candidate_name)
print("FinalAutoMLJobObjectiveMetricName: " + best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName'])
print("FinalAutoMLJobObjectiveMetricValue: " + str(best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))



----
*Note* we are taking the best candidate forward for production, but we could equally choose one of the other models generated

# Step 3 -  Explainability
Run AWS clarify to get global shap values



In [12]:
from sagemaker import clarify
clarify_processor = clarify.SageMakerClarifyProcessor(role=role,
                                                      instance_count=1,
                                                      instance_type='ml.c4.xlarge',
                                                      sagemaker_session=session)


model_config =  clarify.ModelConfig(model_name=auto_ml_job_name,
                                   instance_type='ml.c5.xlarge',
                                   instance_count=1,
                                   accept_type='text/csv')

In [7]:

test_features = pd.read_csv(test_file)
training_data = pd.read_csv(train_file)
                            
test_features.columns


Index(['State', 'Account Length', 'Area Code', 'cust_id', 'Int'l Plan',
       'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge',
       'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls',
       'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge',
       'CustServ Calls'],
      dtype='object')

In [22]:
training_data.columns.to_list()

['State',
 'Account Length',
 'Area Code',
 'cust_id',
 "Int'l Plan",
 'VMail Plan',
 'VMail Message',
 'Day Mins',
 'Day Calls',
 'Day Charge',
 'Eve Mins',
 'Eve Calls',
 'Eve Charge',
 'Night Mins',
 'Night Calls',
 'Night Charge',
 'Intl Mins',
 'Intl Calls',
 'Intl Charge',
 'CustServ Calls',
 'churn_flag']

In [9]:
test_features.iloc[0].values.tolist()

['IL',
 100,
 415,
 '420-6121',
 'no',
 'no',
 0,
 191.9,
 95,
 32.62,
 200.9,
 101,
 17.08,
 271.9,
 74,
 12.24,
 18.2,
 3,
 4.91,
 1]

In [13]:
shap_config = clarify.SHAPConfig(baseline=[test_features.iloc[0].values.tolist()],
                                 num_samples=15,
                                 agg_method='mean_abs')

In [18]:
train_uri = f's3://{bucket}/{prefix}/train/train_data.csv'
explainability_output_path = f's3://{bucket}/{prefix}/exaplainability/'



In [19]:
train_uri

's3://sagemaker-eu-west-1-114936231890/sagemaker/automl-churn/train/train_data.csv'

In [20]:
explainability_output_path = f's3://{bucket}/{prefix}/clarify-explainability'
explainability_data_config = clarify.DataConfig(s3_data_input_path=train_uri,
                                s3_output_path=explainability_output_path,
                                label='churn_flag',
                                headers=training_data.columns.to_list(),
                                dataset_type='text/csv')

In [23]:
shap_config = clarify.SHAPConfig(baseline=[test_features.iloc[0].values.tolist()],
                                 num_samples=15,
                                 agg_method='mean_abs')

In [24]:
clarify_processor.run_explainability(data_config=explainability_data_config,
                                     model_config=model_config,
                                     explainability_config=shap_config)

TypeError: Object of type int64 is not JSON serializable

# Step 4 - Batch inference on Test Set
This could be a separate scoring script.

Tasks are
* define the predictive columns we need (probability scores not output by default) 
* join the scores back to the input data

In [25]:
automl = AutoML.attach(auto_ml_job_name)
best_candidate = automl.describe_auto_ml_job()['BestCandidate']
best_candidate_name = best_candidate['CandidateName']

In [26]:
best_candidate

{'CandidateName': 'tuning-job-1-88a84eb8f52f469399-001-39615d99',
 'FinalAutoMLJobObjectiveMetric': {'MetricName': 'validation:auc',
  'Value': 1.0},
 'ObjectiveStatus': 'Succeeded',
 'CandidateSteps': [{'CandidateStepType': 'AWS::SageMaker::ProcessingJob',
   'CandidateStepArn': 'arn:aws:sagemaker:eu-west-1:114936231890:processing-job/db-1-c67b469b57254972bee8f74e7c5ceb60d4db620d19134696ac408ffc62',
   'CandidateStepName': 'db-1-c67b469b57254972bee8f74e7c5ceb60d4db620d19134696ac408ffc62'},
  {'CandidateStepType': 'AWS::SageMaker::TrainingJob',
   'CandidateStepArn': 'arn:aws:sagemaker:eu-west-1:114936231890:training-job/automl-chu-dpp6-1-fd45a620af5445e0869bd0d134534a0bd9c70456df184',
   'CandidateStepName': 'automl-chu-dpp6-1-fd45a620af5445e0869bd0d134534a0bd9c70456df184'},
  {'CandidateStepType': 'AWS::SageMaker::TransformJob',
   'CandidateStepArn': 'arn:aws:sagemaker:eu-west-1:114936231890:transform-job/automl-chu-dpp6-rpb-1-cce38d3188274c26b3c15e9ac2d6d671ae1d87571',
   'Candidat

In [28]:
inference_response_keys = ['predicted_label', 'probability']
model = automl.create_model(name=best_candidate_name,
candidate=best_candidate,inference_response_keys=inference_response_keys)

In [29]:
output_path = s3_transform_output_path + best_candidate['CandidateName'] +'/'
transformer=model.transformer(instance_count=1, 
                          instance_type='ml.m5.xlarge',
                          assemble_with='Line',
                          accept='text/csv', 
                          output_path=output_path)
transformer.transform(job_name=transform_job_name+"3", 
                      data=test_data_s3_path, 
                      split_type='Line', 
                      input_filter="$", join_source= "Input", 
                      output_filter="$",
                      content_type='text/csv', 
                     
                      wait=False)

describe_response = sm.describe_transform_job(TransformJobName = transform_job_name+"3")
job_run_status = describe_response['TransformJobStatus']
print (job_run_status)

while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    describe_response = sm.describe_transform_job(TransformJobName = transform_job_name+"3")
    job_run_status = describe_response['TransformJobStatus']
    print (describe_response)
    sleep(30)
print ('transform job completed with status : ' + job_run_status)

InProgress
{'TransformJobName': 'automl-churn-model-transform3', 'TransformJobArn': 'arn:aws:sagemaker:eu-west-1:114936231890:transform-job/automl-churn-model-transform3', 'TransformJobStatus': 'InProgress', 'ModelName': 'tuning-job-1-88a84eb8f52f469399-001-39615d99', 'TransformInput': {'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-eu-west-1-114936231890/sagemaker/automl-churn/test/'}}, 'ContentType': 'text/csv', 'CompressionType': 'None', 'SplitType': 'Line'}, 'TransformOutput': {'S3OutputPath': 's3://sagemaker-eu-west-1-114936231890/sagemaker/automl-churn/inference-results/tuning-job-1-88a84eb8f52f469399-001-39615d99/', 'Accept': 'text/csv', 'AssembleWith': 'Line', 'KmsKeyId': ''}, 'TransformResources': {'InstanceType': 'ml.m5.xlarge', 'InstanceCount': 1}, 'CreationTime': datetime.datetime(2021, 1, 30, 17, 0, 34, 79000, tzinfo=tzlocal()), 'DataProcessing': {'InputFilter': '$', 'OutputFilter': '$', 'JoinSource': 'Input'}, 'ResponseMetadata': {'Requ

In [30]:
output_path

's3://sagemaker-eu-west-1-114936231890/sagemaker/automl-churn/inference-results/tuning-job-1-88a84eb8f52f469399-001-39615d99/'

In [31]:
import json
import io
from urllib.parse import urlparse
test_file = 'test_data.csv';

def get_csv_from_s3(s3uri, file_name):
    parsed_url = urlparse(s3uri)
    bucket_name = parsed_url.netloc
    prefix = parsed_url.path[1:].strip('/')
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket_name, '{}/{}'.format(prefix, file_name))
    return obj.get()["Body"].read().decode('utf-8')    
pred_csv = get_csv_from_s3(transformer.output_path, '{}.out'.format(test_file))
data_auc=pd.read_csv(io.StringIO(pred_csv), header=None)


NoSuchKey: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.

In [37]:
data_auc.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,-1.940299,1.298392,-1.244502,-0.063548,0.478876,-0.113497,-0.081473,-1.928193,0.842953,-0.717695,...,-0.046159,0.114589,1.99,-0.120358,-0.842826,-0.535953,0.077028,1.645834,0,0.000314
1,-0.431533,0.098011,-0.527976,0.278634,0.590102,0.120657,-0.124413,-0.137304,0.329442,0.020742,...,0.111203,0.279081,10.99,-0.209558,-0.148833,-0.303565,0.033845,1.642678,0,2e-06
2,1.145541,-1.497131,1.306303,0.028084,-0.515357,-0.361485,-0.528184,-0.040668,1.7313,-1.787512,...,0.037527,0.107572,64.0,-0.279907,-2.051948,0.140723,0.126333,0.354147,0,0.000218
3,-0.761075,0.407323,0.046951,0.248012,1.01805,-0.036275,-0.259689,-0.087672,-0.412512,-0.099484,...,0.097562,0.097415,2.58,-0.412938,-0.584906,-0.488248,0.067834,0.299712,0,1.7e-05
4,-0.493221,-0.259927,1.04895,0.02804,-0.017564,-0.151642,-0.023132,-0.162147,-0.761026,-0.563407,...,0.012147,0.363948,38.44,0.042148,1.355336,0.580501,-0.071353,0.472779,0,0.096856


In [None]:
# Step 5 - Post processing
Typically the probability scores are bucketed into quintiles for more practical usage

In [59]:
data_auc['decile'] = pd.qcut(data_auc.iloc[:,31], 10, labels=False)