# Predicting Boston Housing Prices

### Using XGBoost in SageMaker (Batch Transform)

### 1. Setting up the notebook

In [1]:
import os
import shutil

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

- Amazon SageMaker modules that we will use

In [5]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer

- Create a session for the calculation

In [6]:
# Create the SageMaker session object; later cells use it to upload data and to
# look up the region name and the default S3 bucket.
# NOTE(review): this needs an AWS configuration with a SageMaker-supported
# region — without one the cell fails with the ValueError shown below it.
session = sagemaker.Session()
# Execution role that is later passed to the estimator.
# NOTE(review): presumably only resolvable inside a SageMaker-managed
# environment — confirm before running this notebook locally.
role = get_execution_role()

ValueError: Must setup local AWS configuration with a region supported by SageMaker.

### 2. Download the data

In [7]:
# Load the Boston housing dataset through scikit-learn's `load_boston` helper.
# The `.data`, `.feature_names` and `.target` attributes of the returned object
# are consumed in the next section.
# NOTE(review): `load_boston` was deprecated and later removed from
# scikit-learn (1.2) — confirm the installed version still provides it.
boston = load_boston()

### 3. Prepare the data

In [8]:
# Fixed seed so that both splits (and therefore the trained model and the final
# plot) are reproducible across "Restart Kernel -> Run All".
SPLIT_SEED = 42

# Raw data: features keep their original column names; the target is wrapped in
# a single-column frame so it can be concatenated with the features later on.
X_bos_pd = pd.DataFrame(boston.data, columns=boston.feature_names)
Y_bos_pd = pd.DataFrame(boston.target)

# First split: hold out 1/3 of all rows as the test set.
X_rest, X_test, Y_rest, Y_test = train_test_split(X_bos_pd,
                                                  Y_bos_pd,
                                                  test_size=0.33,
                                                  random_state=SPLIT_SEED)

# Second split: divide the remaining rows into 2/3 training and 1/3 validation.
# The intermediate `*_rest` names avoid overwriting the training variables with
# a subset of themselves.
X_train, X_val, Y_train, Y_val = train_test_split(X_rest,
                                                  Y_rest,
                                                  test_size=0.33,
                                                  random_state=SPLIT_SEED)

### 4. Upload the data files to S3


In [9]:
# Local staging directory for the CSV files written and uploaded further down.
data_dir = '../data/boston'
# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(data_dir, exist_ok=True)

In [12]:
# Test set: features only, written without header or index.
# The file must be named 'test.csv' because that is the name the upload cell
# sends to S3 and the name from which the 'test.csv.out' prediction file read
# later is derived.
X_test.to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

# Validation and training sets: the target column is placed first, followed by
# the features, again without header or index.
pd.concat([Y_val, X_val], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), header=False, index=False)
pd.concat([Y_train, X_train], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

- Upload to S3

In [13]:
# Key prefix under which every file of this notebook is stored.
prefix = 'boston-xgboost-HL'


def _upload_csv(filename):
    """Upload `filename` from `data_dir` under `prefix`; return the result of `session.upload_data`."""
    local_path = os.path.join(data_dir, filename)
    return session.upload_data(local_path, key_prefix=prefix)


# Upload order is unchanged: test, validation, train.
test_location = _upload_csv('test.csv')
val_location = _upload_csv('validation.csv')
train_location = _upload_csv('train.csv')

NameError: name 'session' is not defined

### 5. Train the XGBoost model

In [15]:
# Look up the XGBoost container image for the region of the current session.
container = get_image_uri(session.boto_region_name, 'xgboost')

# S3 location where the training output (the model artifacts) is saved.
model_output_path = 's3://{}/{}/output'.format(session.default_bucket(), prefix)

# Keyword options for the estimator, collected in one place.
estimator_options = dict(
    train_instance_count=1,               # number of instances used for training
    train_instance_type='ml.m4.xlarge',   # type of instance used for training
    output_path=model_output_path,        # where the output is written
    sagemaker_session=session,            # the current SageMaker session
)

# Construct the estimator from the container image and the IAM role
# (our current role in this case).
xgb = sagemaker.estimator.Estimator(container, role, **estimator_options)

NameError: name 'session' is not defined

In [14]:
# Hyperparameters for the training job, gathered in a dict so they can be
# inspected or tweaked before being handed to the estimator.
xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'objective': 'reg:linear',
    'early_stopping_rounds': 10,
    'num_round': 200,
}

xgb.set_hyperparameters(**xgb_hyperparameters)

NameError: name 'xgb' is not defined

In [None]:
def _csv_channel(s3_location):
    """Wrap an S3 location in a `sagemaker.s3_input` declared with content type 'csv'."""
    return sagemaker.s3_input(s3_data=s3_location, content_type='csv')


s3_input_train = _csv_channel(train_location)
s3_input_validation = _csv_channel(val_location)

# Map each channel name to its input and start the training job.
training_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
}
xgb.fit(training_channels)

### 6. Test the model

In [None]:
# Instance settings for the transformer, kept separate from the call itself.
transformer_options = {
    'instance_count': 1,
    'instance_type': 'ml.m4.xlarge',
}

# Derive a transformer object from the estimator.
xgb_transformer = xgb.transformer(**transformer_options)

In [None]:
# Run the transformer on the uploaded test file, declared as CSV and split by line.
xgb_transformer.transform(
    test_location,
    content_type='text/csv',
    split_type='Line',
)

In [None]:
# Wait on the transformer before its output is copied and read in the next cells.
# NOTE(review): presumably blocks until the transform job started above has
# finished — confirm against the SDK documentation.
xgb_transformer.wait()

In [None]:
# Recursively copy everything under the transformer's output path into the
# local data directory. IPython substitutes $xgb_transformer.output_path and
# $data_dir with the Python values before handing the line to the shell.
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

In [None]:
# The downloaded prediction file has no header row.
predictions_path = os.path.join(data_dir, 'test.csv.out')
Y_pred = pd.read_csv(predictions_path, header=None)

In [None]:

# Scatter the true test targets against the model's predictions, using the
# explicit figure/axes interface.
fig, ax = plt.subplots()
ax.scatter(Y_test, Y_pred)
ax.set_xlabel("Median Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Median Price vs Predicted Price")

### 7. Clean up

In [None]:
# Remove the local data directory together with everything inside it.
# shutil.rmtree also deletes hidden files and sub-directories, which a shell
# `rm dir/*` followed by `rmdir` would leave behind, and it avoids
# interpolating an unquoted path into a shell command.
# The isdir guard makes the cell safe to re-run after the directory is gone.
if os.path.isdir(data_dir):
    shutil.rmtree(data_dir)