In [None]:
# SageMaker notebook. This notebook uses the BikeRental data and the XGBoost algorithm.
# The output file can be uploaded to Kaggle to obtain a Kaggle score. (TODO: add this to the README.)

In [1]:
# Install xgboost on the notebook instance
# (-y: skip the confirmation prompt, -c conda-forge: take the package from the conda-forge channel)
!conda install -y -c conda-forge xgboost

Solving environment: done


  current version: 4.5.12
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/ec2-user/anaconda3/envs/python3

  added / updated specs: 
    - xgboost


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    xgboost-0.90               |   py36he1b5a44_4          11 KB  conda-forge
    py-xgboost-0.90            |           py36_4          73 KB  conda-forge
    openssl-1.0.2t             |       h14c3975_0         3.1 MB  conda-forge
    mkl_random-1.0.2           |           py36_0         1.3 MB  conda-forge
    tbb-2019.9                 |       hc9558a2_0         1.4 MB  conda-forge
    mkl_fft-1.0.10             |           py36_0         650 KB  conda-forge
    _py-xgboost-mutex-2.0      |            cpu_0           8 KB  conda-forge
    tbb4py-2019.9        

In [17]:
# import numpy, pandas, sagemaker, xgboost

import sys
import math
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import boto3

import sagemaker
from sagemaker import get_execution_role

# XGBoost
import xgboost as xgb

In [2]:
# SageMaker requires the target variable to be the first column,
# so the target ('count') leads and the input features follow.
columns = ['count'] + [
    'season', 'holiday', 'workingday', 'weather',
    'temp', 'atemp', 'humidity', 'windspeed',
    'year', 'month', 'day', 'dayofweek', 'hour',
]

In [5]:
# Load the training data: 'datetime' is parsed as dates and the first CSV column becomes the index
# (add_features reads the date parts from that index, so column 0 is expected to be 'datetime')
df = pd.read_csv('train.csv', parse_dates=['datetime'], index_col=0)

In [6]:
# Load the test data the same way as the training data: parsed 'datetime', first CSV column as index
df_test = pd.read_csv('test.csv', parse_dates=['datetime'], index_col=0)

In [7]:
# Derive the calendar features year, month, day, dayofweek and hour from the datetime index
def add_features(df):
    """Add calendar columns derived from ``df.index`` to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes ``year``, ``month``, ``day``, ``dayofweek``
        and ``hour`` attributes (e.g. a ``DatetimeIndex``).

    Returns
    -------
    None
        The frame is modified in place; one column per attribute is added
        (or overwritten), in the order listed above.
    """
    for part in ('year', 'month', 'day', 'dayofweek', 'hour'):
        # Each column is named after the index attribute it is copied from.
        df[part] = getattr(df.index, part)

In [8]:
# Enrich both the training frame and the test frame with the calendar features
for frame in (df, df_test):
    add_features(frame)

In [None]:
# Summary statistics of the raw (not yet log-transformed) target column 'count' in the training frame
df['count'].describe()

In [9]:
# Replace the target with log1p(count); predictions are mapped back later with np.expm1.
# The column is overwritten in place, so run this cell only once per freshly loaded `df`
# (re-running it would apply the transform a second time).
df['count'] = np.log1p(df['count'])

In [10]:
# Save the complete training frame (target first, header row and datetime index included)
# to a local CSV file.
df.to_csv(
    'bike_all.csv',
    columns=columns,
    index=True,
    index_label='datetime',
)

# Training and Validation data
 Use 70% of the labelled data for training and the remaining 30% for validation.
 SageMaker requires that the files have no header row and that the target column is the first column.

### Train, validation, and test data are written to files. The column list is also written to a text file.
### This is done so the data can be pushed to S3 for consumption by SageMaker.

In [12]:
# Shuffle the rows with a fixed seed so the split below is reproducible
np.random.seed(5)
shuffled_index = list(df.index)
np.random.shuffle(shuffled_index)  # shuffles the list in place
df = df.loc[shuffled_index]

In [11]:
# Split sizes: the first `train` rows (70%) are used for training,
# the remaining `test` rows are written out as the validation set.
rows = len(df)
train = int(.7 * rows)
test = rows - train

rows, train, test

(10886, 7620, 3266)

In [13]:
# Training split: the first `train` rows, written without header or index, target column first
df.iloc[:train].to_csv(
    'bike_train.csv',
    columns=columns,
    header=False,
    index=False,
)

In [14]:
# Validation split: every row after the first `train` rows, same headerless layout as the training file
df.iloc[train:].to_csv(
    'bike_validation.csv',
    columns=columns,
    header=False,
    index=False,
)

In [15]:
# Test data has only input features (no target column).
# Unlike the train/validation files, this file keeps its header row and the datetime index.
df_test.to_csv('bike_test.csv',index=True,index_label='datetime')

In [16]:
# Record the column order of the headerless training CSV as a single comma-separated line
column_list_text = ','.join(columns)
with open('bike_train_column_list.txt', 'w') as column_file:
    column_file.write(column_list_text)

# Sagemaker section starts here

In [18]:
# specify the bucket name
# specify the folder path within the S3 bucket
# for training, test, and validation files
bucket_name = 'napa-ml-sagemaker'

# Every folder prefix ends with '/' because object keys are built as
# `folder + file_name`. Without the slash the key becomes e.g.
# 'bikerental/validationbike_validation.csv' instead of landing inside the folder.
training_folder = 'bikerental/training/'
test_folder = 'bikerental/test/'
validation_folder = 'bikerental/validation/'


# Full s3:// URIs: where the trained model is written and where each data set lives
s3_model_output_location = r's3://{0}/bikerental/model'.format(bucket_name)
s3_training_file_location = r's3://{0}/{1}'.format(bucket_name,training_folder)
s3_test_file_location = r's3://{0}/{1}'.format(bucket_name, test_folder)
s3_validation_file_location = r's3://{0}/{1}'.format(bucket_name, validation_folder)

In [19]:
# S3 terminology: files are called "objects" and file names are called "keys".
# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    """Upload a local file to S3.

    Parameters
    ----------
    filename : str
        Path of the local file; it is opened in binary mode.
    bucket : str
        Name of the destination S3 bucket.
    key : str
        Object key (path inside the bucket) to upload to.

    Returns
    -------
    Whatever ``Object.upload_fileobj`` returns.
    """
    with open(filename, 'rb') as local_file:
        target_object = boto3.Session().resource('s3').Bucket(bucket).Object(key)
        return target_object.upload_fileobj(local_file)

In [20]:
# Upload the training, validation and test CSV files to S3 for use by SageMaker.
# Each object key is the folder prefix followed by the local file name.
for local_name, s3_folder in (
        ('bike_train.csv', training_folder),
        ('bike_validation.csv', validation_folder),
        ('bike_test.csv', test_folder)):
    write_to_s3(local_name, bucket_name, s3_folder + local_name)

# Training Algorithm Docker image
### SageMaker maintains a separate Docker image for each algorithm and region

In [21]:
# Establish a SageMaker session with AWS; it is reused below for the region lookup and by the estimator
sess = sagemaker.Session()

In [23]:
# Get the execution role that has permission to train and deploy models in SageMaker;
# it is passed to the estimator below
role = get_execution_role()

In [24]:
# The SageMaker API maintains the mapping from (region, algorithm, version) to a container image.
# Look up the XGBoost image for the session's region.
aws_region = sess.boto_region_name
container = sagemaker.amazon.amazon_estimator.get_image_uri(aws_region, "xgboost", "latest")

print("Using Sagemaker container: \n {} ({})".format(container, aws_region))

	get_image_uri(region, 'xgboost', '0.90-1').


Using Sagemaker container: 
 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest (us-east-1)


# Configure the training job:
#   - the algorithm container and the execution role
#   - the type and number of training instances
#   - the S3 location where the final model artifacts are stored
estimator = sagemaker.estimator.Estimator(
    container,
    role,
    base_job_name='xgboost-bikerental-v1',
    sagemaker_session=sess,
    output_path=s3_model_output_location,
    train_instance_type='ml.m4.xlarge',
    train_instance_count=1,
)

In [26]:
# Specify the hyperparameters that are appropriate for the algorithm

# max_depth=5, eta (learning rate)=0.1, num_round=150, objective=reg:linear
# NOTE(review): this comment used to list subsample=0.7 as well, but no subsample
# value is passed below — confirm whether it was meant to be set.
estimator.set_hyperparameters(max_depth=5,
                              objective="reg:linear",
                              eta=0.1,
                              num_round=150)

In [29]:
# Display the hyperparameters currently set on the estimator
estimator.hyperparameters()

{'max_depth': 5, 'objective': 'reg:linear', 'eta': 0.1, 'num_round': 150}

## Specify the training data and validation data location

In [30]:
# Content type can be libsvm or csv for XGBoost.
# Both channels read CSV files located under an S3 prefix.
csv_channel_options = {'content_type': 'csv', 's3_data_type': 'S3Prefix'}

training_input_config = sagemaker.session.s3_input(
    s3_data=s3_training_file_location, **csv_channel_options)

validation_input_config = sagemaker.session.s3_input(
    s3_data=s3_validation_file_location, **csv_channel_options)

# Channel name -> input configuration, as passed to estimator.fit
data_channels = {
    'train': training_input_config,
    'validation': validation_input_config,
}

### Train the model

In [31]:
# XGBoost uses the train and validation channels.
# This starts the SageMaker training job; its log is shown in the cell output.
estimator.fit(data_channels)

2019-11-10 15:36:28 Starting - Starting the training job...
2019-11-10 15:36:29 Starting - Launching requested ML instances......
2019-11-10 15:37:31 Starting - Preparing the instances for training......
2019-11-10 15:38:48 Downloading - Downloading input data...
2019-11-10 15:39:22 Training - Downloading the training image...
2019-11-10 15:39:52 Uploading - Uploading generated training model
2019-11-10 15:39:52 Completed - Training job completed
[31mArguments: train[0m
[31m[2019-11-10:15:39:41:INFO] Running standalone xgboost training.[0m
[31m[2019-11-10:15:39:41:INFO] File size need to be processed in the node: 0.65mb. Available memory size in the node: 8584.8mb[0m
[31m[2019-11-10:15:39:41:INFO] Determined delimiter of CSV input is ','[0m
[31m[15:39:41] S3DistributionType set as FullyReplicated[0m
[31m[15:39:41] 7620x13 matrix with 99060 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-11-10:15:39:41:INFO] Determined delimit

Training seconds: 64
Billable seconds: 64


### Deploy the model

In [32]:
# Deploy the trained model to an endpoint named 'xgboost-bikerental-v1',
# served by a single ml.m4.xlarge instance
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m4.xlarge',
                             endpoint_name='xgboost-bikerental-v1')

---------------------------------------------------------------------------------------------------!

## Invoke the XGBoost cloud prediction template 

In [33]:
# Attach a predictor to the deployed endpoint by name.
# This rebinds `predictor`, replacing the object returned by estimator.deploy.
endpoint_name = 'xgboost-bikerental-v1'
predictor = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name)

In [34]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Send requests as CSV. No deserializer is set, so the prediction loop
# handles the response itself (it decodes the returned bytes as UTF-8 text).
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = None

In [35]:
# Read the test data written earlier to 'bike_test.csv' on the notebook instance into a data frame.
# No index column is specified, so 'datetime' stays a regular column.
df_all = pd.read_csv('bike_test.csv')

In [37]:
# Inspect the column names and their order in the test frame
df_all.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek',
       'hour'],
      dtype='object')

In [36]:
# The predictor needs an array: a numpy array or a list of values both work.
# Select the features by name, in training order: columns[1:] is every training
# column except the target 'count'. This keeps 'datetime' out of the payload,
# and also the 'count' prediction column that is added to df_all later, so the
# cell gives the same result when re-run.
arr_test = df_all[columns[1:]].values

In [38]:
# For a large number of predictions, split the input and query the
# prediction service batch by batch; np.array_split makes it easy to
# specify how many splits are needed (10 here).
predictions = []
for arr in np.array_split(arr_test, 10):
    # The response is UTF-8 bytes holding comma-separated predictions.
    result = predictor.predict(arr).decode("utf-8").split(',')
    print(arr.shape)
    predictions.extend([float(r) for r in result])

(650, 13)
(650, 13)
(650, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)


In [41]:
# Undo the log1p transform applied to 'count' before training: expm1 maps predictions back to counts
np.expm1(predictions)

array([  9.29977784,   5.68963495,   4.11209937, ..., 133.68598006,
        92.35797767,  52.00191752])

In [42]:
# Store the back-transformed predictions (expm1 inverts the earlier log1p) as the 'count' column
df_all['count'] = np.expm1(predictions)

In [45]:
# Write only 'datetime' and the predicted 'count' to a CSV file, without the index column
df_all[['datetime','count']].to_csv('predicted_count_cloud.csv', index=False )