# Initialize the project: libraries, S3 bucket, and storage paths

In [1]:
# import all required libraries
import sagemaker # Amazon Sagemaker Python SDK (needed for using prebuilt Sagemaker models/algos)
import boto3     # AWS Python SDK(helps us read data from S3 buckets, just like pandas helps us read from CSVs, etc.)
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input, Session

In [6]:
# Name of the S3 bucket used by this application
# (access can be IAM controlled/restricted down the road).
bucket_name = "rzapplication1"

# Region configured for the default boto3 session; the bucket-creation cell checks it.
my_region = boto3.session.Session().region_name

# Show the detected region so it can be verified before the bucket is created.
print(my_region)

us-west-2


In [7]:
# Create the S3 bucket programmatically in the region detected for this session.
# Any failure (bucket already exists, missing permissions, invalid region, ...) is
# reported through the "S3 error" message instead of being raised.
s3 = boto3.resource("s3")
try:
    if my_region == "us-east-1":
        # us-east-1 is the S3 default region and rejects an explicit LocationConstraint,
        # so the bucket configuration must be omitted there.
        s3.create_bucket(Bucket=bucket_name)
    else:
        # Every other region has to be named explicitly. Using the detected region
        # (rather than a hard-coded one) means the bucket is created wherever the
        # notebook runs, instead of the cell silently doing nothing.
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': my_region})
    print("S3 bucket '",  bucket_name,"' created successfully!")
except Exception as e:
    print("S3 error: ", e)

S3 bucket ' rzapplication1 ' created successfully!


In [8]:
# Storage layout inside the S3 bucket: everything for this model sits under one key prefix.
model_name_prefix = "sagemaker_xgboost_model"

# S3 URI of the "output" folder under that prefix, kept as the model write path.
model_write_path = "s3://" + "/".join([bucket_name, model_name_prefix, "output"])
print(model_write_path)

s3://rzapplication1/sagemaker_xgboost_model/output


# Get data for training/testing our model

In [9]:
# load additional libraries
import pandas as pd
import urllib 

In [10]:
# download the dataset via specified URL
# `import urllib` on its own does not load the `urllib.request` submodule; without this
# explicit import the call below only works if some other library happened to import it,
# and otherwise fails with an AttributeError that the broad `except` reports as a
# download error.
import urllib.request

try:
    # Saves the CSV next to the notebook as "bank_clean.csv".
    urllib.request.urlretrieve("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", 
                                "bank_clean.csv")
    print("Success: downloaded bank_clean.csv.")
except Exception as e:
    # Best-effort: report the failure and let the notebook continue.
    print("Data download error: ", e) 

Success: downloaded bank_clean.csv.


In [11]:
# Load the downloaded CSV file into a pandas DataFrame named `model_data`.
# A failure is printed rather than raised, so `model_data` stays undefined in that case.
try:
    model_data = pd.read_csv(filepath_or_buffer="./bank_clean.csv")
    print("Success: downloaded file successfully loaded as a pandas dataframe!")
except Exception as load_error:
    print("Pandas load error: ", load_error)

Success: downloaded file successfully loaded as a pandas dataframe!


In [14]:
# do the train-test split of our downloaded dataset, into a 70-30 ratio
import numpy as np  # kept: cells beyond this one may still rely on `np` being imported here

# Shuffle every row reproducibly (fixed random_state), then cut at the 70% mark with
# positional slicing. This selects the same rows as np.split(shuffled, [cut]) but avoids
# calling np.split on a DataFrame, which goes through DataFrame.swapaxes and is
# deprecated in recent pandas releases.
shuffled_data = model_data.sample(frac = 1, random_state = 101)
train_row_count = int(0.7 * len(model_data))
train_data = shuffled_data.iloc[:train_row_count]
test_data = shuffled_data.iloc[train_row_count:]

print("Shape of train data: ", train_data.shape)
print("Shape of test data: ", test_data.shape)

Shape of train data:  (28831, 62)
Shape of test data:  (12357, 62)


In [18]:
# save the train data into S3 bucket
import os  # kept: no longer needed by this cell, but other cells may rely on it

# convert into the data format required by Sagemaker, with response first and then the predictors
# (response column 'y_yes' first; both 'y_no' and 'y_yes' are removed from the predictors),
# written without index or header row
train_predictors = train_data.drop(['y_no', 'y_yes'], axis=1)
pd.concat([train_data['y_yes'], train_predictors], axis=1).to_csv('train.csv',
                                                                  index=False,
                                                                  header=False)

# upload to S3 bucket we created above
# S3 object keys always use "/" as the separator, so the key is built with string
# formatting instead of os.path.join, which would insert "\" on Windows and produce a
# key that does not sit under the "<prefix>/train" location referenced below.
train_object_key = '{}/train/train.csv'.format(model_name_prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_object_key).upload_file('train.csv')

# load the object with reference to this newly uploaded train dataset
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket_name, model_name_prefix), content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [19]:
# save the test data into S3 bucket
import os  # kept: no longer needed by this cell, but other cells may rely on it

# convert into the data format required by Sagemaker, with response first and then the predictors
# (response column 'y_yes' first; both 'y_no' and 'y_yes' are removed from the predictors),
# written without index or header row
test_predictors = test_data.drop(['y_no', 'y_yes'], axis=1)
pd.concat([test_data['y_yes'], test_predictors], axis=1).to_csv('test.csv',
                                                                index=False,
                                                                header=False)

# upload to S3 bucket we created above
# S3 object keys always use "/" as the separator, so the key is built with string
# formatting instead of os.path.join, which would insert "\" on Windows and produce a
# key that does not sit under the "<prefix>/test" location referenced below.
test_object_key = '{}/test/test.csv'.format(model_name_prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(test_object_key).upload_file('test.csv')

# load the object with reference to this newly uploaded test dataset
s3_input_test = sagemaker.s3_input(s3_data='s3://{}/{}/test'.format(bucket_name, model_name_prefix), content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


# Build the XGBoost model using SageMaker's built-in algorithm and the data in the S3 bucket created above

In [22]:
# the Sagemaker pre-built algos are present as containers, and we need to pull them into our current AS instance as follows
