## Boston House pricing Binary Classification problem

Case: Return YES if the new house is predicted to be worth more than $22000. No if not.  

1. Load dataset onto notebook instance from S3
2. Clean, transform and Prepare the dataset
3. Create and train linear learner model
4. Deploy the model into SageMaker hosting

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import io
import sagemaker.amazon.common as smac

import boto3
from sagemaker import get_execution_role
import sagemaker

import matplotlib.pyplot as plt
import seaborn as sns

### Step1: Load the data from S3

In [2]:
role = get_execution_role()
bucket = 'boston-house-bucket'
sub_folder = 'boston-house-data'
data_key = 'boston_housing_raw.csv'
data_location = 's3://{}/{}/{}'.format(bucket, sub_folder, data_key)

df = pd.read_csv(data_location, low_memory = False)
df.head()

Unnamed: 0.1,Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PRATIO,B,LSTAT,MEDV
0,0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


### Step2: Clean, Transform and Prepare the dataset


See [Variable description](http://lib.stat.cmu.edu/datasets/boston)

1. Convert CHAS, RAD varibales into categorical and one-hot encode them
2. MinMaxScale the data so that all the points will be in 0 to 1 range
3. Find the scaled value for $22000

In [3]:
#check if there are any missing values
df.isnull().values.any()

False

In [4]:
#drop unrequired columns
df.drop(columns = ['Unnamed: 0'], inplace = True )


In [5]:
#convert CHAS, RAD attributes to categorical
df['CHAS'] = df['CHAS'].astype('category')
df['RAD'] = df['RAD'].astype('category')

#one-hot encode CHAS, RAD attributes
df = pd.get_dummies(df, columns=['CHAS', 'RAD'])

df.head()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,TAX,PRATIO,B,...,CHAS_1,RAD_1,RAD_2,RAD_3,RAD_4,RAD_5,RAD_6,RAD_7,RAD_8,RAD_24
0,0.00632,18.0,2.31,0.538,6.575,65.2,4.09,296.0,15.3,396.9,...,0,1,0,0,0,0,0,0,0,0
1,0.02731,0.0,7.07,0.469,6.421,78.9,4.9671,242.0,17.8,396.9,...,0,0,1,0,0,0,0,0,0,0
2,0.02729,0.0,7.07,0.469,7.185,61.1,4.9671,242.0,17.8,392.83,...,0,0,1,0,0,0,0,0,0,0
3,0.03237,0.0,2.18,0.458,6.998,45.8,6.0622,222.0,18.7,394.63,...,0,0,0,1,0,0,0,0,0,0
4,0.06905,0.0,2.18,0.458,7.147,54.2,6.0622,222.0,18.7,396.9,...,0,0,0,1,0,0,0,0,0,0


In [6]:
df.shape

(506, 23)

In [7]:
#scale the data to evenly distribute between 0 and 1
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
data_scaled = scaler.fit_transform(df)


  return self.partial_fit(X, y)


In [8]:
#minmaxscaler converts dataframe to ndarray, convert it back to data frame
df_scaled = pd.DataFrame(data = data_scaled, columns = list(df) )
df_scaled.head()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,TAX,PRATIO,B,...,CHAS_1,RAD_1,RAD_2,RAD_3,RAD_4,RAD_5,RAD_6,RAD_7,RAD_8,RAD_24
0,0.0,0.18,0.067815,0.314815,0.577505,0.641607,0.269203,0.208015,0.287234,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000236,0.0,0.242302,0.17284,0.547998,0.782698,0.348962,0.104962,0.553191,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000236,0.0,0.242302,0.17284,0.694386,0.599382,0.348962,0.104962,0.553191,0.989737,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000293,0.0,0.06305,0.150206,0.658555,0.441813,0.448545,0.066794,0.648936,0.994276,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000705,0.0,0.06305,0.150206,0.687105,0.528321,0.448545,0.066794,0.648936,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
df_scaled['MEDV'].head()

0    0.422222
1    0.368889
2    0.660000
3    0.631111
4    0.693333
Name: MEDV, dtype: float64

In [10]:
df_scaled.corr()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,TAX,PRATIO,B,...,CHAS_1,RAD_1,RAD_2,RAD_3,RAD_4,RAD_5,RAD_6,RAD_7,RAD_8,RAD_24
CRIM,1.0,-0.200469,0.406583,0.420972,-0.219247,0.352734,-0.37967,0.582764,0.289946,-0.385064,...,-0.055892,-0.084456,-0.091673,-0.116598,-0.197472,-0.18465,-0.093806,-0.075143,-0.084191,0.632302
ZN,-0.200469,1.0,-0.533828,-0.516604,0.311991,-0.569537,0.664408,-0.314563,-0.391679,0.17552,...,-0.042697,0.248457,0.086702,0.061369,0.07619,-0.005934,0.016154,0.122776,-0.048974,-0.28975
INDUS,0.406583,-0.533828,1.0,0.763651,-0.391676,0.644779,-0.708027,0.72076,0.383248,-0.356977,...,0.062938,-0.17969,-0.049031,-0.279307,-0.030002,-0.108959,-0.099794,-0.166009,-0.169444,0.603593
NOX,0.420972,-0.516604,0.763651,1.0,-0.302188,0.73147,-0.76923,0.668023,0.188933,-0.380051,...,0.091203,-0.160877,-0.134503,-0.25174,-0.229389,0.075839,-0.080115,-0.183122,-0.119886,0.604139
RM,-0.219247,0.311991,-0.391676,-0.302188,1.0,-0.240265,0.205246,-0.292048,-0.355501,0.128069,...,0.091251,0.078383,0.116137,0.076479,-0.113768,0.083847,-0.059651,0.096459,0.211773,-0.222159
AGE,0.352734,-0.569537,0.644779,0.73147,-0.240265,1.0,-0.747881,0.506456,0.261515,-0.273534,...,0.086518,-0.169885,-0.030186,-0.195206,-0.144934,0.012508,-0.06979,-0.188527,-0.00972,0.448516
DIS,-0.37967,0.664408,-0.708027,-0.76923,0.205246,-0.747881,1.0,-0.534432,-0.232471,0.291512,...,-0.099176,0.215315,0.032031,0.183063,0.159835,-0.025191,0.025432,0.239385,0.065296,-0.489642
TAX,0.582764,-0.314563,0.72076,0.668023,-0.292048,0.506456,-0.534432,1.0,0.460853,-0.441808,...,-0.035587,-0.14071,-0.195576,-0.274436,-0.226179,-0.246066,-0.048868,-0.114976,-0.141791,0.909506
PRATIO,0.289946,-0.391679,0.383248,0.188933,-0.355501,0.261515,-0.232471,0.460853,1.0,-0.177383,...,-0.121515,-0.083528,-0.120079,-0.037827,0.165909,-0.478578,-0.068886,-0.00428,-0.049578,0.479177
B,-0.385064,0.17552,-0.356977,-0.380051,0.128069,-0.273534,0.291512,-0.441808,-0.177383,1.0,...,0.048788,0.072508,0.072762,0.111674,0.15052,0.074438,0.078322,0.064926,0.069982,-0.446748


In [11]:
df_scaled['MEDV'].describe()

count    506.000000
mean       0.389618
std        0.204380
min        0.000000
25%        0.267222
50%        0.360000
75%        0.444444
max        1.000000
Name: MEDV, dtype: float64

In [12]:
df['MEDV'].describe()

count    506.000000
mean      22.532806
std        9.197104
min        5.000000
25%       17.025000
50%       21.200000
75%       25.000000
max       50.000000
Name: MEDV, dtype: float64

In [14]:
#this calculation gives scaled down value for any single number(i) if its in MEDV range.
x = df['MEDV']
i = 22

if i in range(len(x)):
    i_scl = ([(i - min(x)) / (max(x) - min(x))]) 
    print("Scaled value of i:", i_scl)
else:
    print('Value not in range')

Scaled value of i: [0.37777777777777777]


### Step 3: Create and Train Linear Learner  model


In [15]:
#randomize data and split data into train, validation and test sets
np.random.seed(0)

rand_split = np.random.rand(len(df_scaled))

train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) & (rand_split < 0.9)
test_list = rand_split <= 0.9

#datasets for training, validating and testing
data_train = df_scaled[train_list]
data_val = df_scaled[val_list]
data_test = df_scaled[test_list]

#convert data sets into numpy.ndarray. X is features and Y is labels

train_X = data_train.drop(columns = 'MEDV').to_numpy() 
train_Y = ((data_train['MEDV'] > 0.377777)+0).to_numpy() #values above 0.37 will return as 1, and below will be as 0.

val_X = data_val.drop(columns = 'MEDV').to_numpy()
val_Y = ((data_val['MEDV'] > 0.377777)+0).to_numpy()

test_X = data_test.drop(columns = 'MEDV').to_numpy()
test_Y = ((data_val['MEDV'] > 0.377777)+0).to_numpy()      

In [39]:
#Create recordIO protobuf type float32 for training data
train_file = 'boston_housing_train_recordIO_protobuf.data'

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, train_X.astype('float32'),
                                train_Y.astype('float32'))
f.seek(0)

#Upload to S3
boto3.Session().resource('s3').Bucket(bucket).Object('linearlearner_train/{}'.format(train_file)).upload_fileobj(f)

#location of the training data in S3
train_channel = 's3://{}/linearlearner_train/{}'.format(bucket,train_file)

In [40]:
#create recordIO protobuf type32 for validation data
validation_file = 'boston_housing_validation_recordIO_protobuf.data'

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, val_X.astype('float32'),
                                val_Y.astype('float32'))
f.seek(0)

#upload to S3
boto3.Session().resource('s3').Bucket(bucket).Object('linearlearner_validation/{}'.format(validation_file,)).upload_fileobj(f)

#location of the validation data in S3
validation_channel = 's3://{}/linearlearner_validation/{}'.format(bucket,validation_file)

In [41]:
# Import the Amazon SageMaker Python SDK and get the linear-learner container.

import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri

container = get_image_uri(boto3.Session().region_name, 'linear-learner',"1")

In [46]:
#create a training job name
job_name = 'bh-linear-learner-job-{}'.format(datetime.now().strftime("%Y%m%d%H%M%S"))
print('job name{}'.format(job_name))

#output path of the model artifacts
output_location = 's3://{}/linearlearner-output'.format(bucket)


job namebh-linear-learner-job-20200513221254


In [47]:
print('The feature_dim hyperparameter needs to be set to {}.'.format(data_train.shape[1]-1)) 

The feature_dim hyperparameter needs to be set to 22.


In [48]:
#session objest manages interactions with necassary AWS services
sess = sagemaker.Session()

#set up linear algorithm from ECR
linear = sagemaker.estimator.Estimator(container,
                                      role,
                                      train_instance_count =1,
                                      train_instance_type = 'ml.c4.xlarge',
                                      output_path=output_location,
                                      sagemaker_session=sess,
                                      input_mode='Pipe')

#set up hyperparameters
linear.set_hyperparameters(feature_dim = 22,
                          predictor_type = 'binary_classifier',
                          mini_batch_size = 300)

#launch training job. This method calls the CreateTrainingJob API call
data_channels = {
    'train': train_channel,
    'validation': validation_channel
}
linear.fit(data_channels, job_name=job_name)

2020-05-13 22:12:56 Starting - Starting the training job...
2020-05-13 22:12:58 Starting - Launching requested ML instances......
2020-05-13 22:14:24 Starting - Preparing the instances for training.........
2020-05-13 22:15:51 Downloading - Downloading input data
2020-05-13 22:15:51 Training - Downloading the training image...
2020-05-13 22:16:23 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/13/2020 22:16:15 INFO 140136610039616] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5'

[34m#metrics {"Metrics": {"validation_binary_classification_cross_entropy_objective": {"count": 1, "max": 0.6281419149259242, "sum": 0.6281419149259242, "min": 0.6281419149259242}}, "EndTime": 1589408176.44075, "Dimensions": {"model": 10, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 1}, "StartTime": 1589408176.440733}
[0m
[34m#metrics {"Metrics": {"validation_binary_classification_cross_entropy_objective": {"count": 1, "max": 0.6855619477062691, "sum": 0.6855619477062691, "min": 0.6855619477062691}}, "EndTime": 1589408176.440808, "Dimensions": {"model": 11, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 1}, "StartTime": 1589408176.440792}
[0m
[34m#metrics {"Metrics": {"validation_binary_classification_cross_entropy_objective": {"count": 1, "max": 0.4289155820520913, "sum": 0.4289155820520913, "min": 0.4289155820520913}}, "EndTime": 1589408176.440867, "Dimensions": {"model": 12, "Host": "algo-1", "Operation":


2020-05-13 22:16:30 Completed - Training job completed
Training seconds: 56
Billable seconds: 56


In [49]:
print('location of the model:{}/{}/output/model.tar.gz').format(output_location, job_name)

location of the model:{}/{}/output/model.tar.gz


AttributeError: 'NoneType' object has no attribute 'format'