# Amazon SageMaker: Train Models and Generate Datasets
Use [Amazon SageMaker Batch Transform](https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform.html) when you need to do the following:

- Preprocess datasets to remove noise or bias that interferes with training or inference.

- Get inferences from large datasets.

- Run inference when you don't need a persistent endpoint.

- Associate input records with inferences to assist the interpretation of results.

**Caution:** This is reference material and is not to be used in production as-is. Please test thoroughly with your own data, as we are not liable for any loss of data.

We use this file to generate the models used in the inferencing workshops and to generate the synthetic dataset used for inferencing.

- Raw dataset: the generated raw housing prices, stored per region (for example, data/raw/region/ny).
- Model for processing the data.
- Curated dataset: the data after preprocessing and before training the regression model, stored per region (for example, data/curated/region/ny).




### Contents

1. [Generate synthetic data for housing models](#Generate-synthetic-data-for-housing-models)
1. [TRANSFORM the raw housing data using Scikit Learn model](#Preprocess-synthetic-housing-data-using-scikit-learn)
1. [Clean up](#CleanUp)


## Section 1 - Generate synthetic data for housing models <a id='Generate-synthetic-data-for-housing-models'></a>

In this section, you will generate synthetic data that will be used to train the linear learner models. The generated data consists of 6 numerical features - the year the house was built, the house size in square feet, the number of bedrooms, the number of bathrooms, the lot size and the number of garage spaces - and two categorical features - deck and front_porch.

In [1]:
import numpy as np
import pandas as pd
import json
import datetime
import time
import boto3
import sagemaker
import os

from time import gmtime, strftime
from random import choice

from sagemaker import get_execution_role

from sagemaker.multidatamodel import MULTI_MODEL_CONTAINER_MODE
from sagemaker.multidatamodel import MultiDataModel

from sklearn.model_selection import train_test_split

In [2]:
# Low-level boto3 clients for the SageMaker control plane and runtime, plus a
# SageMaker SDK session used for uploads and job submission in later cells.
sm_client = boto3.client(service_name='sagemaker')
runtime_sm_client = boto3.client(service_name='sagemaker-runtime')
sagemaker_session = sagemaker.Session()

# S3 access through both the resource and the client interface.
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')

# Default bucket of the SageMaker session; every upload below targets it.
BUCKET  = sagemaker_session.default_bucket()
print("BUCKET : ", BUCKET)

# Execution role handed to the training jobs.
role = get_execution_role()
print("ROLE : ", role)

# Account and region of the credentials this notebook runs with.
ACCOUNT_ID = boto3.client('sts').get_caller_identity()['Account']
REGION = boto3.Session().region_name

# Naming constants.
# NOTE(review): none of these three are referenced in the visible cells - confirm they are still needed.
DATA_PREFIX = 'DEMO_MME_LINEAR_LEARNER'
HOUSING_MODEL_NAME = 'housing'
MULTI_MODEL_ARTIFACTS = 'multi_model_artifacts'



BUCKET :  sagemaker-us-east-1-622343165275
ROLE :  arn:aws:iam::622343165275:role/service-role/AmazonSageMaker-ExecutionRole-20220208T115633


In [3]:
# Number of synthetic house records generated for each location.
NUM_HOUSES_PER_LOCATION = 1000

# Locations for which a separate dataset can be generated.
LOCATIONS = [
    'NewYork_NY',
    'LosAngeles_CA',
    'Chicago_IL',
    'Houston_TX',
    'Dallas_TX',
    'Phoenix_AZ',
    'Philadelphia_PA',
    'SanAntonio_TX',
    'SanDiego_CA',
    'SanFrancisco_CA',
]

# Latest allowed YEAR_BUILT; also the reference year for a house's age in gen_price.
MAX_YEAR = 2019

In [4]:
def gen_price(house):
    """Generate the synthetic price of a house from its features.

    Args:
        house: Mapping with the keys SQUARE_FEET, NUM_BEDROOMS, NUM_BATHROOMS,
            LOT_ACRES, GARAGE_SPACES, YEAR_BUILT, FRONT_PORCH and DECK, where
            the last two hold 'y' or 'n'.

    Returns:
        The price as an int (the weighted sum is truncated by ``int``).
    """
    # Each categorical feature adds a flat premium when present. DECK must be
    # read here: deriving both flags from FRONT_PORCH would count the porch
    # twice and leave DECK without any influence on the price.
    deck = 1 if house['DECK'] == 'y' else 0
    front_porch = 1 if house['FRONT_PORCH'] == 'y' else 0

    # Linear pricing model; each year of age (relative to MAX_YEAR) costs 5000.
    price = int(150 * house['SQUARE_FEET'] + \
                10000 * house['NUM_BEDROOMS'] + \
                15000 * house['NUM_BATHROOMS'] + \
                15000 * house['LOT_ACRES'] + \
                10000 * deck + \
                10000 * front_porch + \
                15000 * house['GARAGE_SPACES'] - \
                5000 * (MAX_YEAR - house['YEAR_BUILT']))
    return price

In [5]:
def gen_yes_no():
    """Return 'y' or 'n', picked at random, as the value of a categorical feature."""
    return choice(['y', 'n'])

In [6]:
def gen_random_house():
    """Generate one row of housing data (a single house plus its price).

    Returns:
        A list ordered as YEAR_BUILT, SQUARE_FEET, NUM_BEDROOMS, NUM_BATHROOMS,
        LOT_ACRES, GARAGE_SPACES, FRONT_PORCH, DECK, PRICE.
    """
    # The draws are made in a fixed order so a seeded generator reproduces
    # the same house.
    square_feet = np.random.normal(3000, 750)
    num_bedrooms = np.random.randint(2, 7)        # upper bound is exclusive
    num_bathrooms = np.random.randint(2, 7) / 2   # half-bath increments
    lot_acres = round(np.random.normal(1.0, 0.25), 2)
    garage_spaces = np.random.randint(0, 4)       # upper bound is exclusive
    year_built = min(MAX_YEAR, int(np.random.normal(1995, 10)))  # capped at MAX_YEAR
    front_porch = gen_yes_no()
    deck = gen_yes_no()

    features = {
        'SQUARE_FEET': square_feet,
        'NUM_BEDROOMS': num_bedrooms,
        'NUM_BATHROOMS': num_bathrooms,
        'LOT_ACRES': lot_acres,
        'GARAGE_SPACES': garage_spaces,
        'YEAR_BUILT': year_built,
        'FRONT_PORCH': front_porch,
        'DECK': deck,
    }

    return [
        year_built,
        square_feet,
        num_bedrooms,
        num_bathrooms,
        lot_acres,
        garage_spaces,
        front_porch,
        deck,
        gen_price(features),
    ]

In [7]:
def gen_houses(num_houses):
    """Build a housing dataset with ``num_houses`` randomly generated rows.

    Args:
        num_houses: Number of houses (rows) to generate.

    Returns:
        A pandas DataFrame with one row per house; PRICE is the last column.
    """
    column_names = [
        'YEAR_BUILT',
        'SQUARE_FEET',
        'NUM_BEDROOMS',
        'NUM_BATHROOMS',
        'LOT_ACRES',
        'GARAGE_SPACES',
        'FRONT_PORCH',
        'DECK',
        'PRICE',
    ]
    rows = [gen_random_house() for _ in range(num_houses)]
    return pd.DataFrame(rows, columns=column_names)

In [38]:
def save_data_locally(location, train, test):
    """Write the train and test splits for ``location`` to local CSV files.

    The files are written without header row and without index column to
    ``data/<location>/train/train.csv`` and ``data/<location>/test/test.csv``;
    missing directories are created.

    Args:
        location: Location name used as the sub-directory under ``data``.
        train: DataFrame holding the training split.
        test: DataFrame holding the test split.
    """
    train_dir = f'data/{location}/train'
    test_dir = f'data/{location}/test'
    csv_options = dict(sep=',', header=False, index=False)

    os.makedirs(train_dir, exist_ok=True)
    train.to_csv(f'{train_dir}/train.csv', **csv_options)

    os.makedirs(test_dir, exist_ok=True)
    test.to_csv(f'{test_dir}/test.csv', **csv_options)

    print(f"train:saved:to data/{location}/train/train.csv ")
    print(f"Test:saved:to data/{location}/test/test.csv ")
    


#### We can launch 'n' jobs (Processing / Training) in parallel with the same object and different inputs

**To demonstrate this feature, change PARALLEL_TRAINING_JOBS to a higher value.**

In [57]:
# Generate housing data for the first PARALLEL_TRAINING_JOBS entries of LOCATIONS.
# Keep PARALLEL_TRAINING_JOBS low to limit the number of training jobs and models,
# or raise it (up to len(LOCATIONS)) to experiment with more models.

PARALLEL_TRAINING_JOBS = 1

for loc in LOCATIONS[:PARALLEL_TRAINING_JOBS]:
    houses = gen_houses(NUM_HOUSES_PER_LOCATION)
    
    # Split the data into train and test in a 90:10 ratio.
    # The train split is not divided further into train and validation because it is not preprocessed yet.
    train, test = train_test_split(houses, test_size=0.1)
    save_data_locally(loc, train, test)

# After the loop, `houses`, `train` and `test` refer to the last location processed.


train:saved:to data/NewYork_NY/train/train.csv 
Test:saved:to data/NewYork_NY/test/test.csv 


In [62]:
# Show the first few rows of the generated data.
# `houses` is the full dataset (train + test) of the last location generated
# above, so it must not be written back to train.csv: that would put the
# held-out test rows into the training file. The cell only previews the data.
location = 'NewYork_NY'
houses.head(2)

## Section 2 - Preprocess the raw housing data using Scikit Learn <a id='Preprocess-synthetic-housing-data-using-scikit-learn'></a>

In this section, the categorical features of the data (deck and front porch) are preprocessed using scikit-learn to convert them to a one-hot encoded representation, and the numerical features are standardized.



In [59]:
# Inspect the column names of the generated housing DataFrame.
houses.columns

Index(['YEAR_BUILT', 'SQUARE_FEET', 'NUM_BEDROOMS', 'NUM_BATHROOMS',
       'LOT_ACRES', 'GARAGE_SPACES', 'FRONT_PORCH', 'DECK', 'PRICE'],
      dtype='object')

In [11]:
import joblib

In [12]:
# Create the directory that receives the preprocessing entry-point script written by the next cell.
!mkdir -p scripts

In [13]:
%%writefile scripts/sklearn_preprocessor_batch.py
from __future__ import print_function

import argparse
import csv
import json
import os
import shutil
import sys
import time
from io import StringIO

import numpy as np
import pandas as pd
from sagemaker_containers.beta.framework import (
    content_types,
    encoders,
    env,
    modules,
    transformer,
    worker,
)
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer, OneHotEncoder, StandardScaler

from sklearn.externals import joblib

# The incoming CSV files carry no header row, so the column names are fixed here.
feature_columns_names = [
    "YEAR_BUILT",
    "SQUARE_FEET",
    "NUM_BEDROOMS",
    "NUM_BATHROOMS",
    "LOT_ACRES",
    "GARAGE_SPACES",
    "FRONT_PORCH",
    "DECK",
]

label_column = "PRICE"

# Every feature is a float except the three columns declared as str below.
# NOTE(review): the dtype maps are not referenced anywhere in the visible script.
feature_columns_dtype = dict.fromkeys(feature_columns_names, np.float64)
feature_columns_dtype.update(YEAR_BUILT=str, FRONT_PORCH=str, DECK=str)

label_column_dtype = {"PRICE": np.float64}


def read_training_data(train_dir, column_names):
    """Read every file in ``train_dir`` as a headerless CSV and concatenate them.

    Args:
        train_dir: Directory containing the training CSV files.
        column_names: Column names assigned to each file, in order.

    Returns:
        A single pandas DataFrame holding the rows of all files.

    Raises:
        ValueError: If ``train_dir`` contains no files.
    """
    # Sorted so the files are logged and concatenated in a stable order.
    input_files = [os.path.join(train_dir, name) for name in sorted(os.listdir(train_dir))]
    if not input_files:
        # Format the complete message; calling .format on a "+"-joined chain
        # would only apply to the last literal and leave "{}" unfilled.
        raise ValueError(
            (
                "There are no files in {}.\n"
                "This usually indicates that the train channel was incorrectly specified,\n"
                "the data specification in S3 was incorrectly specified or the role specified\n"
                "does not have permission to access the data."
            ).format(train_dir)
        )

    for path in input_files:
        print("file :", path)

    # Read ALL files, not just the last one listed.
    frames = [pd.read_csv(path, header=None, names=column_names) for path in input_files]
    return pd.concat(frames)


if __name__ == "__main__":
    # Training entry point: fit the preprocessing pipeline on the train
    # channel and store it as model.joblib in the model directory.

    parser = argparse.ArgumentParser()

    # Sagemaker specific arguments. Defaults are set in the environment variables.
    parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])

    args = parser.parse_args()

    # Take the set of files and read them all into a single pandas dataframe
    concat_data = read_training_data(args.train, feature_columns_names + [label_column])

    print(concat_data)

    # This section is adapted from the scikit-learn example of using preprocessing pipelines:
    #
    # https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
    #

    # Every feature except the two categorical ones is standardized.
    numeric_features = list(feature_columns_names)
    numeric_features.remove("FRONT_PORCH")
    numeric_features.remove("DECK")
    numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

    # The y/n columns are one-hot encoded; unknown categories are ignored at transform time.
    categorical_features = ["FRONT_PORCH", "DECK"]
    categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

    # remainder="drop" discards every other column, including the PRICE label.
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ],
        remainder="drop",
    )

    preprocessor.fit(concat_data)

    joblib.dump(preprocessor, os.path.join(args.model_dir, "model.joblib"))

    print("saved model!")


def input_fn(input_data, content_type):
    """Parse the input data payload into a DataFrame with named columns.

    Only CSV input is accepted. Both labelled and unlabelled data must be
    handled, so the presence of the label column is determined from the number
    of columns provided.

    Args:
        input_data: The CSV payload as text, without a header row.
        content_type: MIME type of the payload; must be "text/csv".

    Returns:
        A pandas DataFrame whose columns are the feature names, followed by
        the label column when the payload is labelled.

    Raises:
        ValueError: If the content type is not "text/csv", or the number of
            columns matches neither the labelled nor the unlabelled layout.
    """
    if content_type != "text/csv":
        raise ValueError("{} not supported by script!".format(content_type))

    # Read the raw input data as CSV.
    df = pd.read_csv(StringIO(input_data), header=None)

    num_columns = len(df.columns)
    if num_columns == len(feature_columns_names) + 1:
        # This is a labelled example: the last column is the label.
        df.columns = feature_columns_names + [label_column]
    elif num_columns == len(feature_columns_names):
        # This is an unlabelled example.
        df.columns = feature_columns_names
    else:
        # Fail here with a clear message instead of handing unnamed columns
        # to the preprocessor, which selects its inputs by column name.
        raise ValueError(
            "Expected {} or {} CSV columns but received {}.".format(
                len(feature_columns_names), len(feature_columns_names) + 1, num_columns
            )
        )

    return df


def output_fn(prediction, accept):
    """Format the prediction output for the requested accept type.

    The default accept/content-type between containers for serial inference is JSON.
    The response mimetype is set to the same value as ``accept`` so the next
    container can read the response payload correctly.

    Args:
        prediction: Array-like result of ``predict_fn``.
        accept: Requested MIME type; "application/json" or "text/csv".

    Returns:
        A ``worker.Response`` carrying the encoded payload.

    Raises:
        RuntimeError: If ``accept`` is not a supported MIME type.
    """
    if accept == "application/json":
        # One {"features": [...]} entry per row of the prediction.
        instances = [{"features": row} for row in prediction.tolist()]
        json_output = {"instances": instances}

        return worker.Response(json.dumps(json_output), mimetype=accept)
    elif accept == "text/csv":
        return worker.Response(encoders.encode(prediction, accept), mimetype=accept)
    else:
        # RuntimeError is the built-in exception; "RuntimeException" does not
        # exist in Python and would surface as a NameError instead.
        raise RuntimeError("{} accept type is not supported by this script.".format(accept))


def predict_fn(input_data, model):
    """Apply the fitted preprocessor to the input data.

    This is implemented because the default handler calls ``.predict()``,
    whereas the model here is a preprocessor and needs ``.transform()``.

    Args:
        input_data: DataFrame produced by ``input_fn``.
        model: Fitted preprocessor returned by ``model_fn``.

    Returns:
        The transformed features. When the input contains the label column,
        the label is inserted as the first column in front of the features.
    """
    # Debug output of the incoming data.
    print("Input data type ", type(input_data))
    print(input_data)

    transformed = model.transform(input_data)

    # Debug output of the transformed features (logged twice, under two labels).
    print("features type ", type(transformed))
    print(transformed)
    print("features_array ", type(transformed))
    print(transformed)

    if label_column not in input_data:
        # Unlabelled input: return only the set of features.
        return transformed

    # Labelled input: return the label (as the first column) and the set of features.
    return np.insert(transformed, 0, input_data[label_column], axis=1)


def model_fn(model_dir):
    """Load the fitted preprocessor stored as ``model.joblib`` in ``model_dir``."""
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)



Overwriting scripts/sklearn_preprocessor_batch.py


In [14]:
# Create the SKLearn estimator with scripts/sklearn_preprocessor_batch.py as the entry-point script.
from sagemaker.sklearn.estimator import SKLearn

script_path = 'scripts/sklearn_preprocessor_batch.py'

# The estimator runs the script on the given instance type with the execution
# role and SageMaker session created earlier in this notebook.
sklearn_estimator = SKLearn(
    entry_point=script_path,
    role=role,
    instance_type="ml.c4.xlarge",
    framework_version="0.20.0",
    sagemaker_session=sagemaker_session)

In [15]:
# Upload the raw training data of every selected location to the S3 bucket,
# where the SKLearn training job reads it from.
train_inputs = []

for loc in LOCATIONS[:PARALLEL_TRAINING_JOBS]:
    local_csv = f'data/{loc}/train/train.csv'
    s3_prefix = f'housing-data/{loc}/train'

    train_input = sagemaker_session.upload_data(path=local_csv, bucket=BUCKET, key_prefix=s3_prefix)

    # Keep the returned S3 URI; the training cell indexes this list by location.
    train_inputs.append(train_input)
    print("Raw training data uploaded to : ", train_input)

Raw training data uploaded to :  s3://sagemaker-us-east-1-622343165275/housing-data/NewYork_NY/train/train.csv


In [16]:
## Launch one scikit-learn training job per selected location to fit the preprocessor
## on the raw synthetic data generated for that location.
## Before executing this, take the training instance limits in your account and cost into consideration.

sklearn_estimators = []
sklearn_estimator_jobs = []

for index, loc in enumerate(LOCATIONS[:PARALLEL_TRAINING_JOBS]):
    print("sklearn_estimator fit input data at ", index , " for loc ", loc)

    # The job name is derived from the current UTC time at one-second resolution.
    timestamp = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
    job_name = f'scikit-learnestimator-{timestamp}'

    # wait=False returns right after submission so the jobs can run in parallel.
    sklearn_estimator.fit({'train': train_inputs[index]}, job_name=job_name, wait=False)

    # NOTE(review): the same estimator object is appended for every location,
    # so all entries of sklearn_estimators refer to one object; the job names
    # are what distinguish the individual jobs.
    sklearn_estimators.append(sklearn_estimator)
    sklearn_estimator_jobs.append(job_name)

    # Pause so the next job gets a different timestamp-based name.
    time.sleep(1)

sklearn_estimator fit input data at  0  for loc  NewYork_NY


In [17]:
# Block until every submitted preprocessor training job has completed or stopped.
waiter = sm_client.get_waiter('training_job_completed_or_stopped')

for job_name in sklearn_estimator_jobs:
    print('Waiting for job {} to complete...'.format(job_name))
    waiter.wait(TrainingJobName=job_name)

print("Training Jobs completed")

Waiting for job scikit-learnestimator-2022-09-29-04-05-22 to complete...
Training Jobs completed


### Save these models locally and then upload them to 
**bucket/inferencewk/models/batch/loc**

In [33]:
# Create the local directory the model artifacts are downloaded into.
!mkdir -p models/batch
# Model artifact location and base job name reported by the first estimator entry.
print(sklearn_estimators[0].model_data)
print(sklearn_estimators[0].base_job_name)

# Bucket the artifacts are re-uploaded to in a later cell.
print(BUCKET)

s3://sagemaker-us-east-1-622343165275/scikit-learnestimator-2022-09-29-04-05-22/output/model.tar.gz
None
sagemaker-us-east-1-622343165275


In [34]:
for index, loc in enumerate(LOCATIONS[:PARALLEL_TRAINING_JOBS]):
    estimator = sklearn_estimators[index]
    model_loc = estimator.model_data
    sagemaker.s3.S3Downloader().download(
        s3_uri=model_loc,
        local_path=f'models/batch/{loc}',
        sagemaker_session=sagemaker_session
    )
!ls -alrt models/batch

total 20
-rw-rw-r-- 1 ec2-user ec2-user 1489 Sep  8 16:44 model.tar.gz_OLD
drwxrwxr-x 4 ec2-user ec2-user 4096 Sep  8 16:44 ..
drwxrwxr-x 2 ec2-user ec2-user 4096 Sep 29 04:50 .ipynb_checkpoints
drwxrwxr-x 4 ec2-user ec2-user 4096 Sep 29 04:53 .
drwxrwxr-x 2 ec2-user ec2-user 4096 Sep 29 04:59 NewYork_NY


In [45]:
# Upload each downloaded model artifact to s3://<BUCKET>/inferencewk/models/batch/<loc>.
# The resulting URIs are collected by appending: a pre-sized [""*N] list always
# has exactly one element and fails with IndexError for more than one location.
batch_model_inputs = []
for loc in LOCATIONS[:PARALLEL_TRAINING_JOBS]:
    s3_model_loc = sagemaker.s3.S3Uploader().upload(
        local_path=f'models/batch/{loc}/model.tar.gz',
        desired_s3_uri=f's3://{BUCKET}/inferencewk/models/batch/{loc}',
        sagemaker_session=sagemaker_session
    )
    print(s3_model_loc)
    batch_model_inputs.append(s3_model_loc)
print(batch_model_inputs)

s3://sagemaker-us-east-1-622343165275/inferencewk/models/batch/NewYork_NY/model.tar.gz
['s3://sagemaker-us-east-1-622343165275/inferencewk/models/batch/NewYork_NY/model.tar.gz']


In [42]:
# Upload the raw train and test CSV files (written without header rows) of each
# location to s3://<BUCKET>/inferencewk/data/raw/<loc>/{train,test}.
# The URIs are collected by appending: a pre-sized [""*N] list always has
# exactly one element and fails with IndexError for more than one location.
train_inputs = []
test_inputs = []

for loc in LOCATIONS[:PARALLEL_TRAINING_JOBS]:
    s3_train_loc = sagemaker.s3.S3Uploader().upload(
        local_path=f'data/{loc}/train/train.csv',
        desired_s3_uri=f's3://{BUCKET}/inferencewk/data/raw/{loc}/train',
        sagemaker_session=sagemaker_session
    )
    print(f"train:loc:uploaded:{s3_train_loc}:WITHOUT HEADERS:")
    train_inputs.append(s3_train_loc)

    s3_test_loc = sagemaker.s3.S3Uploader().upload(
        local_path=f'data/{loc}/test/test.csv',
        desired_s3_uri=f's3://{BUCKET}/inferencewk/data/raw/{loc}/test',
        sagemaker_session=sagemaker_session
    )
    print(f"test:loc:uploaded:{s3_test_loc}:WITHPUT HEADERS")
    test_inputs.append(s3_test_loc)

print(train_inputs)

train:loc:uploaded:s3://sagemaker-us-east-1-622343165275/inferencewk/data/raw/NewYork_NY/train/train.csv:WITHOUT HEADERS:
test:loc:uploaded:s3://sagemaker-us-east-1-622343165275/inferencewk/data/raw/NewYork_NY/test/test.csv:WITHPUT HEADERS
['s3://sagemaker-us-east-1-622343165275/inferencewk/data/raw/NewYork_NY/train/train.csv']


In [43]:
# Show the S3 URI of the raw training data uploaded for the first location.
train_inputs[0]

's3://sagemaker-us-east-1-622343165275/inferencewk/data/raw/NewYork_NY/train/train.csv'