## Setup Copied from Challenge 2

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import sys
import time
import json
from IPython.display import display
from time import strftime, gmtime
import sagemaker
from sagemaker.predictor import csv_serializer

In [2]:
# AWS helpers used from here on
import boto3
import re
from sagemaker import get_execution_role

# S3 bucket and key prefix under which the data and model output are stored
bucket = 'sagemaker-exercises'
prefix = 'sagemaker/DEMO-xgboost'

# IAM role returned by get_execution_role(); handed to the estimator later on
role = get_execution_role()

In [3]:
# Raise the column display limit so wide frames are shown in full, then load the raw data
pd.set_option('display.max_columns', 500)
churn = pd.read_csv('./Data sets/churn.txt')

In [4]:
# Clean up the frame in one chained expression:
#   * drop the 'Phone' column and the four '* Charge' columns
#   * store 'Area Code' as object so it is one-hot encoded rather than kept numeric
churn = (
    churn
    .drop(columns=['Phone', 'Day Charge', 'Eve Charge', 'Night Charge', 'Intl Charge'])
    .astype({'Area Code': object})
)

In [5]:
# One-hot encode with pd.get_dummies
model_data = pd.get_dummies(churn)

# Move the target ('Churn?_True.') to the first column and drop the
# complementary 'Churn?_False.' indicator
model_data = pd.concat(
    [
        model_data['Churn?_True.'],
        model_data.drop(columns=['Churn?_False.', 'Churn?_True.']),
    ],
    axis=1,
)

In [6]:
# train-validation-test split: 70% / 20% / 10% of the shuffled rows.
# Positional slicing replaces np.split(DataFrame, ...), which goes through
# DataFrame.swapaxes and therefore warns on newer pandas releases. The cut points
# and the resulting frames are the same as before.
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled_data))
validation_end = int(0.9 * len(shuffled_data))

train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:validation_end]
test_data = shuffled_data.iloc[validation_end:]

# Written without header or index; the label stays in the first column
train_data.to_csv('train.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)

In [7]:
# Upload both CSVs to s3://<bucket>/<prefix>/<channel>/<channel>.csv.
# S3 keys always use '/', so they are built with str.format instead of
# os.path.join, whose separator depends on the operating system.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for channel in ('train', 'validation'):
    s3_key = '{}/{}/{}.csv'.format(prefix, channel, channel)
    s3_bucket.Object(s3_key).upload_file('{}.csv'.format(channel))

### Create and train the model

In [8]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the 'linear-learner' image for the session's region
# (configured as a binary classifier further down, i.e. logistic regression)
container = get_image_uri(
    boto3.Session().region_name,
    'linear-learner',
)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [9]:
# Training and validation channels, both read as CSV.
# Both prefixes end with '/' so that only objects inside the respective folder
# are matched (a bare '.../train' prefix would also match sibling keys that merely
# start with "train").
s3_input_train = sagemaker.s3_input(
    s3_data='s3://{}/{}/train/'.format(bucket, prefix),
    content_type='text/csv',
)
s3_input_validation = sagemaker.s3_input(
    s3_data='s3://{}/{}/validation/'.format(bucket, prefix),
    content_type='text/csv',
)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [10]:
# Session object passed to the estimator below
sess = sagemaker.Session()

# Estimator around the linear-learner container: one ml.m4.xlarge training
# instance, with output written to s3://<bucket>/<prefix>/output
lr = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=sess,
)



In [11]:
#https://docs.aws.amazon.com/sagemaker/latest/dg/ll_hyperparameters.html

# feature_dim is derived from the training frame instead of being hard-coded:
# every column except the label in column 0 is a feature. It stays correct if the
# one-hot encoding yields a different number of columns.
lr.set_hyperparameters(feature_dim=train_data.shape[1] - 1,
                       epochs=15,
                       predictor_type='binary_classifier')

# Launch the training job with the train and validation channels
lr.fit({'train': s3_input_train, 'validation': s3_input_validation})

2020-07-10 19:01:34 Starting - Starting the training job...
2020-07-10 19:01:37 Starting - Launching requested ML instances.........
2020-07-10 19:03:21 Starting - Preparing the instances for training......
2020-07-10 19:04:34 Downloading - Downloading input data...
2020-07-10 19:05:01 Training - Downloading the training image...
2020-07-10 19:05:25 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[07/10/2020 19:05:30 INFO 139921038047040] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'in


2020-07-10 19:05:48 Uploading - Uploading generated training model
2020-07-10 19:05:48 Completed - Training job completed
Training seconds: 74
Billable seconds: 74


# Challenge 3 Begins Below: 

### Use Amazon SageMaker Batch Transform to run batch inference on the train.csv dataset

In [12]:
# Deploy the trained model on a single ml.m4.xlarge instance
lr_predictor = lr.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)



-----------------!

In [13]:
# Configure the predictor before testing it: requests are serialized as CSV and,
# with no deserializer set, the response is left undecoded
lr_predictor.serializer = csv_serializer
lr_predictor.content_type = 'text/csv'
lr_predictor.deserializer = None

In [23]:
# Show the first held-out record as an array (column 0 is the label)
test_data.iloc[:1].to_numpy()

array([[  0. , 186. ,   0. , 137.8,  97. , 187.7, 118. , 146.4,  85. ,
          8.7,   6. ,   1. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   1. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   1. ,   1. ,   0. ,   1. ,   0. ]])

In [24]:
#test deployed model (multiple instances)

def predict(data, rows=500, predictor=None):
    """Score ``data`` against a deployed endpoint in mini-batches.

    Parameters
    ----------
    data : numpy.ndarray
        Array with one record per row along axis 0.
    rows : int, default 500
        Batch-size hint; ``data`` is split into ``int(n / rows + 1)`` chunks.
    predictor : object, optional
        Any object with a ``predict(array)`` method that returns UTF-8 encoded
        bytes. Defaults to the notebook-level ``lr_predictor``.

    Returns
    -------
    numpy.ndarray
        1-D float array parsed from the comma-joined responses, in request
        order. Empty when ``data`` has no rows.

    Notes
    -----
    NOTE(review): parsing assumes every response body is a plain
    comma-separated list of numbers -- confirm this matches the response
    format of the deployed linear-learner endpoint.
    """
    if predictor is None:
        # Fall back to the endpoint deployed earlier in the notebook
        predictor = lr_predictor

    n_chunks = int(data.shape[0] / float(rows) + 1)

    # Empty chunks (possible when data is empty or rows == 1) are skipped so the
    # endpoint is never invoked with an empty payload.
    responses = [
        predictor.predict(chunk).decode('utf-8')
        for chunk in np.array_split(data, n_chunks)
        if chunk.shape[0] > 0
    ]
    if not responses:
        return np.array([], dtype=float)

    # Join once instead of growing a string inside the loop
    return np.fromstring(','.join(responses), sep=',')

# Score the held-out set; column 0 (the label) is sliced off before sending
predictions = predict(test_data.to_numpy()[:, 1:], rows=500)



In [26]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform.html

# Batch-transform job definition; results are written to s3://<bucket>/batchoutput
transformer = sagemaker.transformer.Transformer(
    base_transform_job_name='Batch-Transform',
    model_name="linear-learner-2020-07-10-19-01-34-351",  # get from Inference->Models on web interface
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path='s3://{}/batchoutput'.format(bucket),  # create new path from s3 bucket
)

# train.csv still has the label in column 0, while the model was trained on the
# feature columns only. input_filter drops that first column from every record
# before it is sent to the model.
transformer.transform(
    's3://{}/{}/train/'.format(bucket, prefix),  # same location as the training channel
    content_type='text/csv',
    split_type='Line',
    input_filter='$[1:]',
)

# Block until the job finishes so that a failed transform surfaces in the notebook
transformer.wait()