In [None]:
# default_exp core

# Setup


> The objective that we want to achieve is to establish the right playbooks that would help us in
  bringing more efficiencies in our ML training and deployment processes, by leveraging various
  components of Sagemaker Studio.

> This is the setup notebook for the Heidelberg Cement project. It should be run only once, at the initial stage.

## Imports

In [None]:
import sys
import sagemaker
import boto3
import json
import pandas as pd
import numpy as np
import logging
import random
import argparse
import time
import os
import io

from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup
from datetime import datetime
from sagemaker import get_execution_role
from sagemaker.spark.processing import PySparkProcessor
from argparse import Namespace, ArgumentParser


# to get the latest sagemaker python sdk
#!pip install -U sagemaker

## Constants

**NOTE:** Before running this notebook, be sure to set the stack name in the first code cell to match the name of the CloudFormation stack you used to create this notebook instance. If you used the default stack name, you should not need to make any updates.

In [None]:
STACK_NAME = 'heidelberg-mlops-stack-3' # if you're not using the default stack name, replace this
%store STACK_NAME

STREAM_NAME='heidelberg-mlops-stream'
%store STREAM_NAME

APPLICATION_NAME='heidelberg-mlops-app'
%store APPLICATION_NAME

LOCAL_DIR='data'
%store LOCAL_DIR

Stored 'STACK_NAME' (str)
Stored 'STREAM_NAME' (str)
Stored 'APPLICATION_NAME' (str)
Stored 'LOCAL_DIR' (str)


## Clients

In [None]:
# IAM role of this notebook; reused for the feature group and the KDA application.
role = get_execution_role()

# Build ONE boto3 session and derive every client from it, so that the clients and
# the Feature Store session (which is constructed from `boto_session` together with
# `sm` / `smfs_runtime`) always target the same region.
# The environment's configured region wins; "us-east-1" is kept as the fallback.
boto_session = boto3.Session(region_name=boto3.Session().region_name or "us-east-1")

sm = boto_session.client(service_name='sagemaker')
smfs_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime')

s3_client = boto_session.client('s3')
cf_client = boto_session.client('cloudformation')
kinesis_client = boto_session.client('kinesis')
lambda_client = boto_session.client('lambda')
kda_client = boto_session.client('kinesisanalytics')

## Data

1. Specify the Transformed data s3 location
1. If you have run the data_preparation_notebook, do not change this; otherwise, specify your own S3 data location.

In [None]:
s3_bucket = sagemaker.Session().default_bucket()
s3_prefix = 'heidelberg/data'

s3_transformed_data_filename = "transformed.csv"

s3_transformed_data_path="s3://{}/{}/{}".format(s3_bucket,s3_prefix,s3_transformed_data_filename)
print("Make sure trasformed data is present at : " ,s3_transformed_data_path)


Make sure trasformed data is present at :  s3://sagemaker-us-east-1-082830052325/heidelberg/data/transformed.csv


# Setup

1. Create feature groups
2. Create an Amazon Kinesis data stream
3. Create an Amazon Kinesis Data Applications (KDA) application

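### Get ARNs of Lambda functions from CloudFormation stack outputs
1. IngestLambdaFunctionARN (the Lambda that writes streamed records to the feature group)
2. PredictLambdaFunctionARN and PredictLambdaFunctionName (the Lambda that invokes the model endpoint)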

In [None]:
# Resolve the Lambda ARNs / names that the CloudFormation stack publishes as outputs.
try:
    stack_description = cf_client.describe_stacks(StackName=STACK_NAME)['Stacks'][0]
except cf_client.exceptions.ClientError as err:
    # Only the API failure is translated into the "stack not found" hint; the original
    # error stays attached as the cause so permission/throttling problems remain visible.
    msg = f'CloudFormation stack {STACK_NAME} was not found. Please set the STACK_NAME properly and re-run this cell'
    raise ValueError(msg) from err

# A stack without any outputs has no 'Outputs' key at all.
outputs = stack_description.get('Outputs', [])
stack_outputs = {o['OutputKey']: o['OutputValue'] for o in outputs}

# Fail here, with a clear message, instead of with a NameError in a later cell.
required_output_keys = ('IngestLambdaFunctionARN', 'PredictLambdaFunctionARN', 'PredictLambdaFunctionName')
missing_output_keys = [key for key in required_output_keys if key not in stack_outputs]
if missing_output_keys:
    raise ValueError(f'CloudFormation stack {STACK_NAME} is missing expected outputs: {missing_output_keys}')

lambda_to_fs_arn = stack_outputs['IngestLambdaFunctionARN']
lambda_to_model_arn = stack_outputs['PredictLambdaFunctionARN']
predict_lambda_name = stack_outputs['PredictLambdaFunctionName']

In [None]:
print(f'lambda_to_model_arn: {lambda_to_model_arn}')
print(f'lambda_to_fs_arn: {lambda_to_fs_arn}')
print(f'predict_lambda_name: {predict_lambda_name}')

lambda_to_model_arn: arn:aws:lambda:us-east-1:082830052325:function:HeidelbergInvokeFraudEndpointLambda3
lambda_to_fs_arn: arn:aws:lambda:us-east-1:082830052325:function:HeidelbergStreamingIngestAggFeatures3
predict_lambda_name: HeidelbergInvokeFraudEndpointLambda3


In [None]:
%store lambda_to_model_arn
%store predict_lambda_name

Stored 'lambda_to_model_arn' (str)
Stored 'predict_lambda_name' (str)


## Create feature groups 
When using Amazon SageMaker Feature Store, a core design decision is the definition of feature groups. 


In [None]:
import datetime as dt
import dateutil.parser as parser


def create_feature_group_from_s3_input(s3_bucket, s3_prefix, s3_filename, fg_name, record_identifier_name, event_time_feature_name,role_arn=None, s3_uri=None):
    """Create a SageMaker feature group whose schema is inferred from a CSV stored in S3.

    The CSV at ``s3://<s3_bucket>/<s3_prefix>/<s3_filename>`` is loaded into a
    DataFrame, the ``target`` column is dropped (if present), values are rounded to
    5 decimals and missing values are replaced by 0. The resulting frame is used only
    to derive the feature definitions; no records are ingested here.

    Parameters
    ----------
    s3_bucket, s3_prefix, s3_filename : str
        Location of the source CSV.
    fg_name : str
        Name of the feature group to create.
    record_identifier_name : str
        Name of the column used as record identifier.
    event_time_feature_name : str
        Name of the column used as event time.
    role_arn : str, optional
        IAM role passed to ``FeatureGroup.create``. Defaults to the notebook-level ``role``.
    s3_uri : str, optional
        S3 URI passed to ``FeatureGroup.create``. Defaults to ``s3://<s3_bucket>``.
    """
    csv_object = boto3.resource('s3').Object(s3_bucket, s3_prefix+'/'+s3_filename)
    csv_bytes = csv_object.get()['Body'].read()

    data = pd.read_csv(io.BytesIO(csv_bytes))
    # The label is not a feature; tolerate inputs that do not carry it.
    data = data.drop(columns=['target'], errors='ignore')
    data = data.round(5).fillna(0)

    # Uses the notebook-level boto3 session and SageMaker clients.
    feature_store_session = Session(
        boto_session=boto_session,
        sagemaker_client=sm,
        sagemaker_featurestore_runtime_client=smfs_runtime
    )

    feature_group = FeatureGroup(name=fg_name, sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(data_frame=data)

    # Honour the caller-supplied values; previously both arguments were ignored.
    if role_arn is None:
        role_arn = role
    if s3_uri is None:
        s3_uri = "s3://{}".format(s3_bucket)

    # specify record_identifier_name and event_time_feature_name
    feature_group.create(
        s3_uri=s3_uri,
        record_identifier_name=record_identifier_name,
        event_time_feature_name=event_time_feature_name,
        role_arn=role_arn,
        enable_online_store=True
    )

In [None]:
fg_name = 'heidelberg-mlops-fg'
%store fg_name

Stored 'fg_name' (str)


#### Create feature groups(if not already Created)

In [None]:
create_feature_group_from_s3_input(s3_bucket,s3_prefix,s3_transformed_data_filename,fg_name,'id','event_time',role) 

#### Show that the feature store is aware of the new feature groups

In [None]:
sm.list_feature_groups()

{'FeatureGroupSummaries': [{'FeatureGroupName': 'transaction-feature-group-15-05-22-59',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:082830052325:feature-group/transaction-feature-group-15-05-22-59',
   'CreationTime': datetime.datetime(2020, 12, 15, 5, 23, 10, 113000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}},
  {'FeatureGroupName': 'ml-ops-2',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:082830052325:feature-group/ml-ops-2',
   'CreationTime': datetime.datetime(2020, 12, 19, 6, 49, 19, 335000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'},
  {'FeatureGroupName': 'ml-ops-1',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:082830052325:feature-group/ml-ops-1',
   'CreationTime': datetime.datetime(2020, 12, 18, 7, 7, 15, 38000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'},
  {'FeatureGroupName': 'identity-feature-group-15-05-22-59',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:0828300523

#### Describe each feature group

In [None]:
sm.describe_feature_group(FeatureGroupName=fg_name)

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:082830052325:feature-group/heidelberg-mlops-fg',
 'FeatureGroupName': 'heidelberg-mlops-fg',
 'RecordIdentifierFeatureName': 'id',
 'EventTimeFeatureName': 'event_time',
 'FeatureDefinitions': [{'FeatureName': 'kst_brutto',
   'FeatureType': 'Fractional'},
  {'FeatureName': 'sm', 'FeatureType': 'Fractional'},
  {'FeatureName': 'tm', 'FeatureType': 'Fractional'},
  {'FeatureName': 'cl', 'FeatureType': 'Fractional'},
  {'FeatureName': 'so3', 'FeatureType': 'Fractional'},
  {'FeatureName': 'k2o', 'FeatureType': 'Fractional'},
  {'FeatureName': 'na2o', 'FeatureType': 'Fractional'},
  {'FeatureName': 'south_kiln_feed_01om886__tph__avg',
   'FeatureType': 'Fractional'},
  {'FeatureName': 'south_kiln_feed_01om886__tph__max',
   'FeatureType': 'Fractional'},
  {'FeatureName': 'north_kiln_feed_01om885__tph__avg',
   'FeatureType': 'Fractional'},
  {'FeatureName': 'north_kiln_feed_01om885__tph__max',
   'FeatureType': 'Fractional'},
  {'FeatureNam

## Create an Amazon Kinesis Data Stream

In [None]:
kinesis_client.create_stream(StreamName=STREAM_NAME, ShardCount=1)

{'ResponseMetadata': {'RequestId': 'eb540470-5ef8-75f8-b33a-ad63000d5a46',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'eb540470-5ef8-75f8-b33a-ad63000d5a46',
   'x-amz-id-2': '1aiJ/8+8IRwrKGfUkHHq76P5bGilrC/E/1H2debFOePX6SzYGUx6ZtxUDNb2VhE5GyNx8j4xOI1P5F1eYnSIGhPI9iJZcD0+',
   'date': 'Tue, 5 Jan 2021 17:50:11 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0'},
  'RetryAttempts': 0}}

In [None]:
kinesis_client.list_streams()

{'StreamNames': ['Django-internal-error',
  'Foo',
  'cloudcrm-vdr-test',
  'cloudcrm_pivotal_kinesis_stream',
  'cn-support-vision-1-article-stream',
  'cn-support-vision-1-domain-stream',
  'cn-support-vision-2-dev-Stream',
  'csv-kba-summary',
  'heidelberg-mlops-stream',
  'model-stream',
  'real-time-analysis',
  'sococo-video-insights-stream',
  'solarwind-2-logs-stream',
  'test-data-stream',
  'topbox_ingestion_stream',
  'tu2k20_ashish.gupta1_kinesis',
  'tu2k20_shivangchopra11',
  'vdr-eventstream',
  'viswak-tu2k20'],
 'HasMoreStreams': False,
 'ResponseMetadata': {'RequestId': 'e5761aba-a456-6a2f-bd18-b3affaa34591',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e5761aba-a456-6a2f-bd18-b3affaa34591',
   'x-amz-id-2': 'KRaSoOl3qWuLe5kBwOTVAe8xgoI8Opl5Rowgy4Z77+4iE1R0N3C9Rn3alXqYoGqMNgj3qX1ra8mKu4RVzbHLN2BmJjK35P/8',
   'date': 'Tue, 5 Jan 2021 17:50:13 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '502'},
  'RetryAttempts': 0}}

In [None]:
kinesis_client.describe_stream(StreamName=STREAM_NAME)

{'StreamDescription': {'StreamName': 'heidelberg-mlops-stream',
  'StreamARN': 'arn:aws:kinesis:us-east-1:082830052325:stream/heidelberg-mlops-stream',
  'StreamStatus': 'CREATING',
  'Shards': [],
  'HasMoreShards': False,
  'RetentionPeriodHours': 24,
  'StreamCreationTimestamp': datetime.datetime(2021, 1, 5, 17, 50, 10, tzinfo=tzlocal()),
  'EnhancedMonitoring': [{'ShardLevelMetrics': []}],
  'EncryptionType': 'NONE'},
 'ResponseMetadata': {'RequestId': 'd70a0a13-2cea-0a49-8f64-a305721f25f7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'd70a0a13-2cea-0a49-8f64-a305721f25f7',
   'x-amz-id-2': '2wHUeiUHaRjrgMJWgiaE+1mFf753aalkJwhrL2yP7djOHoEPJ30owbKhgXB+FdLymwwxupJumKkb9KLfBbRWW2Us8OQiBkjK',
   'date': 'Tue, 5 Jan 2021 17:50:14 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '343'},
  'RetryAttempts': 0}}

In [None]:
import time

# Poll the stream until it is ACTIVE.
# Every non-ACTIVE status sleeps between polls (previously only CREATING did, so any
# other status turned this into a tight, endless loop of API calls), and the wait is
# bounded by a timeout.
STREAM_POLL_INTERVAL_SECONDS = 20
STREAM_WAIT_TIMEOUT_SECONDS = 600

active_stream = False
stream_wait_deadline = time.monotonic() + STREAM_WAIT_TIMEOUT_SECONDS
while not active_stream:
    status = kinesis_client.describe_stream(StreamName=STREAM_NAME)['StreamDescription']['StreamStatus']
    if status == 'ACTIVE':
        active_stream = True
        print('ACTIVE')
    elif status == 'DELETING':
        # A stream that is being deleted will never become active.
        raise RuntimeError(f'Kinesis stream {STREAM_NAME} is being deleted')
    elif time.monotonic() >= stream_wait_deadline:
        raise TimeoutError(
            f'Kinesis stream {STREAM_NAME} still has status {status} '
            f'after {STREAM_WAIT_TIMEOUT_SECONDS} seconds'
        )
    else:
        print('Waiting for the Kinesis stream to become active...')
        time.sleep(STREAM_POLL_INTERVAL_SECONDS)

Waiting for the Kinesis stream to become active...
ACTIVE


In [None]:
stream_arn = kinesis_client.describe_stream(StreamName=STREAM_NAME)['StreamDescription']['StreamARN']
stream_arn

'arn:aws:kinesis:us-east-1:082830052325:stream/heidelberg-mlops-stream'

## Map the Kinesis stream as an event source for Lambda Model detection

In [None]:
# Register the Kinesis stream (stream_arn) as an event source of the Lambda identified
# by lambda_to_model_arn, reading from the LATEST position of the stream.
lambda_client.create_event_source_mapping(EventSourceArn=stream_arn,
                                          FunctionName=lambda_to_model_arn,
                                          StartingPosition='LATEST',
                                          Enabled=True,
                                          MaximumRecordAgeInSeconds=60*10  # 600 seconds
                                          ) 
# No DestinationConfig is passed here; configuring one would handle discarded records.
# FunctionName receives the ARN resolved from the CloudFormation stack outputs;
# replace lambda_to_model_arn if a different Lambda should consume the stream.

{'ResponseMetadata': {'RequestId': 'fb8d00e7-65e8-4433-90da-1b3d1bed683a',
  'HTTPStatusCode': 202,
  'HTTPHeaders': {'date': 'Tue, 05 Jan 2021 17:50:44 GMT',
   'content-type': 'application/json',
   'content-length': '821',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'fb8d00e7-65e8-4433-90da-1b3d1bed683a'},
  'RetryAttempts': 0},
 'UUID': '9974f2db-2d9a-4081-9e31-7cbb227ca548',
 'StartingPosition': 'LATEST',
 'BatchSize': 100,
 'MaximumBatchingWindowInSeconds': 0,
 'ParallelizationFactor': 1,
 'EventSourceArn': 'arn:aws:kinesis:us-east-1:082830052325:stream/heidelberg-mlops-stream',
 'FunctionArn': 'arn:aws:lambda:us-east-1:082830052325:function:HeidelbergInvokeFraudEndpointLambda3',
 'LastModified': datetime.datetime(2021, 1, 5, 17, 50, 44, 109000, tzinfo=tzlocal()),
 'LastProcessingResult': 'No records processed',
 'State': 'Creating',
 'StateTransitionReason': 'User action',
 'DestinationConfig': {'OnFailure': {}},
 'MaximumRecordAgeInSeconds': 600,
 'BisectBatchOnFuncti

## Create an Amazon Kinesis Data Applications (KDA) application

In [None]:
# sql_code_example = 'CREATE OR REPLACE STREAM "DESTINATION_SQL_STREAM" (\n' + \
#                 '"cc_num"              BIGINT,\n' + \
#                 '"num_trans_last_10m"  SMALLINT,\n' + \
#                 '"avg_amt_last_10m"    REAL\n);\n\n' + \
#             'CREATE OR REPLACE PUMP "STREAM_PUMP" AS\n' + \
#             'INSERT INTO "DESTINATION_SQL_STREAM"\n' + \
#                 'SELECT STREAM "cc_num", \n' + \
#                     'COUNT(*) OVER LAST_10_MINUTES, \n' + \
#                     'AVG("amount") OVER LAST_10_MINUTES\n' + \
#                     'FROM "SOURCE_SQL_STREAM_001"\n' + \
#                     'WINDOW LAST_10_MINUTES AS (\n' + \
#                         'PARTITION BY "cc_num"\n' + \
#                         'RANGE INTERVAL \'10\' MINUTE PRECEDING);\n'
# print(sql_code_example)



# sql_code = ""  # Store features from stream to featurestore

CREATE OR REPLACE STREAM "DESTINATION_SQL_STREAM" (
"cc_num"              BIGINT,
"num_trans_last_10m"  SMALLINT,
"avg_amt_last_10m"    REAL
);

CREATE OR REPLACE PUMP "STREAM_PUMP" AS
INSERT INTO "DESTINATION_SQL_STREAM"
SELECT STREAM "cc_num", 
COUNT(*) OVER LAST_10_MINUTES, 
AVG("amount") OVER LAST_10_MINUTES
FROM "SOURCE_SQL_STREAM_001"
WINDOW LAST_10_MINUTES AS (
PARTITION BY "cc_num"
RANGE INTERVAL '10' MINUTE PRECEDING);



#### Application Code (SQL QUERY) For KDA

In [None]:
def get_features():
    """Return the feature column names of the transformed dataset.

    Downloads the transformed CSV from S3 (notebook-level ``s3_bucket``, ``s3_prefix``
    and ``s3_transformed_data_filename``) and returns its column names, in file order,
    without the ``target`` label column.

    Only the header row is parsed: the column names do not depend on the data rows,
    so rounding / filling the whole dataset is unnecessary. A file without a
    ``target`` column is accepted instead of raising ``KeyError``.
    """
    s3 = boto3.resource('s3')
    obj = s3.Object(s3_bucket, s3_prefix+'/'+s3_transformed_data_filename)
    body = obj.get()['Body'].read()
    header_only = pd.read_csv(io.BytesIO(body), nrows=0)
    return [column for column in header_only.columns if column != 'target']

features = get_features()

In [None]:
def get_sql_code(feature_names=None):
    """Build the SQL application code for the Kinesis Data Analytics application.

    The generated code declares ``DESTINATION_SQL_STREAM`` with one ``REAL`` column
    per feature and a pump that copies those columns from ``SOURCE_SQL_STREAM_001``.

    Parameters
    ----------
    feature_names : iterable of str, optional
        Column names to include. When omitted, the notebook-level ``features`` list
        is used, which keeps the original zero-argument call working.

    Returns
    -------
    str
        The SQL text.
    """
    if feature_names is None:
        feature_names = features

    quoted_columns = ['"' + name + '"' for name in feature_names]

    # Each line but the last ends with a comma; with no columns nothing is emitted.
    column_definitions = ',\n'.join(column + '      REAL' for column in quoted_columns)
    select_list = ', \n'.join(quoted_columns)
    if quoted_columns:
        column_definitions += '\n'
        select_list += ' \n'

    # NOTE(review): the LAST_10_MINUTES window is declared, but none of the selected
    # expressions reference it - confirm whether it is still needed.
    return (
        'CREATE OR REPLACE STREAM "DESTINATION_SQL_STREAM" (\n'
        + column_definitions
        + '); \n\n'
        + 'CREATE OR REPLACE PUMP "STREAM_PUMP" AS\n INSERT INTO "DESTINATION_SQL_STREAM"\n'
        + 'SELECT STREAM '
        + select_list
        + 'FROM "SOURCE_SQL_STREAM_001"\n'
        + 'WINDOW LAST_10_MINUTES AS (\n'
        + 'PARTITION BY "id"\n'
        + 'RANGE INTERVAL \'10\' MINUTE PRECEDING);\n'
    )

sql_code=get_sql_code()
print(sql_code)

CREATE OR REPLACE STREAM "DESTINATION_SQL_STREAM" (
"kst_brutto"      REAL,
"sm"      REAL,
"tm"      REAL,
"cl"      REAL,
"so3"      REAL,
"k2o"      REAL,
"na2o"      REAL,
"south_kiln_feed_01om886__tph__avg"      REAL,
"south_kiln_feed_01om886__tph__max"      REAL,
"north_kiln_feed_01om885__tph__avg"      REAL,
"north_kiln_feed_01om885__tph__max"      REAL,
"north_fan_speed_01oa943__rpm__avg"      REAL,
"north_fan_speed_01oa943__rpm__max"      REAL,
"south_fan_speed_02oa943__rpm__avg"      REAL,
"south_fan_speed_02oa943__rpm__max"      REAL,
"lignite_main_burner_03sk820__tph__avg"      REAL,
"lignite_main_burner_03sk820__tph__max"      REAL,
"bpg_main_burner_03bf810__tph__avg"      REAL,
"bpg_main_burner_03bf810__tph__max"      REAL,
"lignite_calciner_02sk820__tph__avg"      REAL,
"lignite_calciner_02sk820__tph__max"      REAL,
"bpg_calciner_02bf810__tph__avg"      REAL,
"bpg_calciner_02bf810__tph__max"      REAL,
"kbs_calciner_00kb950__tph__avg"      REAL,
"kbs_calciner_00kb950__t

In [None]:
def get_record_columns():
    """Build the ``RecordColumns`` entries of the KDA input schema.

    Downloads the transformed CSV from S3 (notebook-level ``s3_bucket``, ``s3_prefix``
    and ``s3_transformed_data_filename``) and returns one mapping per column except
    ``target``. Each mapping has the column name as ``Name``, the JSON path
    ``$.<column>`` as ``Mapping`` and ``REAL`` as ``SqlType``.

    Only the header row is parsed, because the entries depend on the column names
    alone; loading and rounding every row is unnecessary.
    """
    s3 = boto3.resource('s3')
    obj = s3.Object(s3_bucket, s3_prefix+'/'+s3_transformed_data_filename)
    body = obj.get()['Body'].read()
    header_only = pd.read_csv(io.BytesIO(body), nrows=0)
    return [
        {'Name': column, 'Mapping': '$.'+column, 'SqlType': 'REAL'}
        for column in header_only.columns
        if column != 'target'
    ]
record_columns  = get_record_columns()


In [None]:
# Input configuration of the KDA application: records are read from the Kinesis
# stream (stream_arn) using the notebook role, parsed as UTF-8 JSON, and mapped to
# the columns described by record_columns.
kda_inputs = [
    {
        'NamePrefix': 'SOURCE_SQL_STREAM',
        'KinesisStreamsInput': {'ResourceARN': stream_arn, 'RoleARN': role},
        'InputSchema': {
            'RecordFormat': {
                'RecordFormatType': 'JSON',
                'MappingParameters': {
                    'JSONMappingParameters': {'RecordRowPath': '$'},
                },
            },
            'RecordEncoding': 'UTF-8',
            'RecordColumns': record_columns,
        },
    },
]

<h3> Create Kinesis Data Analytics Application </h3>

We first lookup Lambda ARNs from CloudFormation output, then create a Kinesis Data Analytics application that connects its output to the Streaming Lambda. This Lambda will ingest the records and write them to the SageMaker Feature Group.

In [None]:
kda_outputs = [{'LambdaOutput': {'ResourceARN': lambda_to_fs_arn, 'RoleARN': role},
                'Name': 'DESTINATION_SQL_STREAM',
                'DestinationSchema': {'RecordFormatType': 'JSON'}}]
kda_outputs

[{'LambdaOutput': {'ResourceARN': 'arn:aws:lambda:us-east-1:082830052325:function:HeidelbergStreamingIngestAggFeatures3',
   'RoleARN': 'arn:aws:iam::082830052325:role/MySagemakerRoleHeidelberg3'},
  'Name': 'DESTINATION_SQL_STREAM',
  'DestinationSchema': {'RecordFormatType': 'JSON'}}]

In [None]:
kda_client.create_application(ApplicationName=APPLICATION_NAME, 
                              Inputs=kda_inputs,
                              Outputs=kda_outputs,
                              ApplicationCode=sql_code)

{'ApplicationSummary': {'ApplicationName': 'heidelberg-mlops-app',
  'ApplicationARN': 'arn:aws:kinesisanalytics:us-east-1:082830052325:application/heidelberg-mlops-app',
  'ApplicationStatus': 'READY'},
 'ResponseMetadata': {'RequestId': 'd2eb591b-650e-4c3a-acb7-737c98783b23',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'd2eb591b-650e-4c3a-acb7-737c98783b23',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '193',
   'date': 'Tue, 05 Jan 2021 17:51:13 GMT'},
  'RetryAttempts': 0}}

In [None]:
kda_client.describe_application(ApplicationName=APPLICATION_NAME)

{'ApplicationDetail': {'ApplicationName': 'heidelberg-mlops-app',
  'ApplicationARN': 'arn:aws:kinesisanalytics:us-east-1:082830052325:application/heidelberg-mlops-app',
  'ApplicationStatus': 'READY',
  'CreateTimestamp': datetime.datetime(2021, 1, 5, 17, 51, 14, tzinfo=tzlocal()),
  'LastUpdateTimestamp': datetime.datetime(2021, 1, 5, 17, 51, 14, tzinfo=tzlocal()),
  'InputDescriptions': [{'InputId': '1.1',
    'NamePrefix': 'SOURCE_SQL_STREAM',
    'InAppStreamNames': ['SOURCE_SQL_STREAM_001'],
    'KinesisStreamsInputDescription': {'ResourceARN': 'arn:aws:kinesis:us-east-1:082830052325:stream/heidelberg-mlops-stream',
     'RoleARN': 'arn:aws:iam::082830052325:role/MySagemakerRoleHeidelberg3'},
    'InputSchema': {'RecordFormat': {'RecordFormatType': 'JSON',
      'MappingParameters': {'JSONMappingParameters': {'RecordRowPath': '$'}}},
     'RecordEncoding': 'UTF-8',
     'RecordColumns': [{'Name': 'kst_brutto',
       'Mapping': '$.kst_brutto',
       'SqlType': 'REAL'},
      {'N

In [None]:
kda_client.start_application(ApplicationName=APPLICATION_NAME,
                             InputConfigurations=[{'Id': '1.1',
                                                   'InputStartingPositionConfiguration': 
                                                     {'InputStartingPosition':'NOW'}}])

{'ResponseMetadata': {'RequestId': '65d9eecd-8466-4fcb-8de5-88a6d0369662',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '65d9eecd-8466-4fcb-8de5-88a6d0369662',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '2',
   'date': 'Tue, 05 Jan 2021 17:51:22 GMT'},
  'RetryAttempts': 0}}

# Training and Deploying the Model

#### 1. Train and Deploy the model (Hyperparameter Training)
#### 2. Get the endpoint of deployed model

### Split the data in train and val and train the model using XGBOOST 

In [None]:
s3 = boto3.resource('s3')
obj = s3.Object(s3_bucket,s3_prefix + '/' + s3_transformed_data_filename )
body = obj.get()['Body'].read()
df=pd.read_csv(io.BytesIO(body))

In [None]:
from sklearn.model_selection import train_test_split

# The files are written without a header, and SageMaker's built-in XGBoost reads the
# FIRST column of a CSV as the label. Put 'target' first explicitly instead of relying
# on the column order of the source file; the feature columns keep their order.
SPLIT_SEED = 42  # fixed seed so a re-run reproduces the same split
label_first_columns = ['target'] + [column for column in df.columns if column != 'target']

train, val = train_test_split(df[label_first_columns], test_size=0.3, random_state=SPLIT_SEED)
train.to_csv('train.csv', header=False, index=False)
val.to_csv('val.csv', header=False, index=False)


In [None]:
!aws s3 cp train.csv s3://{s3_bucket}/{s3_prefix}/
!aws s3 cp val.csv s3://{s3_bucket}/{s3_prefix}/

upload: ./train.csv to s3://sagemaker-us-east-1-082830052325/heidelberg/data/train.csv
upload: ./val.csv to s3://sagemaker-us-east-1-082830052325/heidelberg/data/val.csv


In [None]:
from sagemaker.inputs import TrainingInput

# initialize hyperparameters
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        # squared-error regression; "reg:linear" is the deprecated alias of this objective
        "objective":"reg:squarederror",
        "num_round":"100"}

# model artifacts are written below the data prefix
output_path = 's3://{}/{}/output'.format(s3_bucket, s3_prefix)

# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
# specify the repo_version depending on your preference.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", sagemaker.Session().boto_region_name, "1.2-1")

# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=xgboost_container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1, 
                                          instance_type='ml.m5.2xlarge', 
                                          volume_size=5, # 5 GB 
                                          output_path=output_path)

# define the data type and paths to the training and validation datasets
content_type = "csv"
train_input = TrainingInput("s3://{}/{}/{}".format(s3_bucket, s3_prefix, 'train.csv'), content_type=content_type)
validation_input = TrainingInput("s3://{}/{}/{}".format(s3_bucket, s3_prefix, 'val.csv'), content_type=content_type)

# execute the XGBoost training job (blocks until the job finishes)
estimator.fit({'train': train_input, 'validation': validation_input})


2021-01-05 17:52:09 Starting - Starting the training job...
2021-01-05 17:52:35 Starting - Launching requested ML instancesProfilerReport-1609869129: InProgress
......
2021-01-05 17:53:36 Starting - Preparing the instances for training...
2021-01-05 17:54:05 Downloading - Downloading input data...
2021-01-05 17:54:38 Training - Training image download completed. Training in progress..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined d

In [None]:
predictor = estimator.deploy(
    initial_instance_count=1, 
    instance_type='ml.t2.medium',
    serializer=sagemaker.serializers.CSVSerializer(), wait=True)

---------------------!

In [None]:
endpoint_name=predictor.endpoint_name
#predictor.endpoint_name
#Store the endpoint name for later cleanup 
%store endpoint_name
endpoint_name

Stored 'endpoint_name' (str)


'sagemaker-xgboost-2021-01-05-17-55-28-234'

(TESTING PURPOSE) Now to check that our endpoint is working, let's call it directly with a record from our test hold-out set. 

In [None]:
# Testing code for endpoint
# Testing code for endpoint
payload_df = df.drop(['target'], axis=1)
payload = payload_df.head(1).to_csv(index=False, header=False).strip()
payload

'0.528384279,0.467741935,0.347826087,0.120689655,0.481481481,0.6521739129999999,0.222222222,0.89211593,0.815234731,0.891824174,0.826809847,0.818747008,0.790460807,0.845313157,0.8135465000000001,0.952188843,0.966913627,0.001595844,0.031735957,0.407821166,0.4427397,0.906654154,0.7025761309999999,0.0,0.0,0.6677419689999999,0.6071359479999999,0.6016636639999999,0.6008397089999999,0.6265824729999999,0.65160482,0.695344011,0.641205199,0.352858223,0.362426861,0.495575534,0.288251453,0.507363451,0.246757564,0.422013598,0.27402085600000003,0.7520242079999999,0.767547988,0.247975792,0.273029205,0.37539531,0.20433238,0.309886113,0.304407087,0.238270631,0.221075679,0.236838573,0.262799678,0.334076875,0.332396145,0.003750389,0.005742277,0.475591607,0.8703220690000001,0.449176795,0.299913568,0.321962498,0.322896068,0.317556703,0.320800742,0.312070381,0.313770855,0.313072295,0.31379601100000004,0.780272629,0.7256131259999999,0.533413581,0.561219567,0.567581641,0.5251290679999999,0.587453926,0.5971556

In [None]:
float(predictor.predict(payload).decode('utf-8'))

1.9875482320785522

In [None]:
!rm train.csv

In [None]:
!rm val.csv

# Making predictions using streaming aggregated features

Now we will send data to our input Kinesis stream and show that predictions are produced for it.

### Ensure Lambda knows which SageMaker endpoint to use
Our Lambda function that invokes the endpoint thus needs a way to know the endpoint name. We handle that through a Lambda environment variable.

This section of code simply takes care of updating the ENDPOINT_NAME Lambda environment variable. It is important to do so before we start feeding records into our Kinesis stream.

In [None]:
print(f'Updating Lambda to use endpoint: {endpoint_name} for ARN: {lambda_to_model_arn}')

# Read the current environment so existing variables are preserved; only
# ENDPOINT_NAME is added or overwritten.
# A function without any environment variables may not report an 'Environment'
# section at all, so fall back to an empty mapping instead of raising KeyError.
current_configuration = lambda_client.get_function_configuration(FunctionName=lambda_to_model_arn)
variables = current_configuration.get('Environment', {}).get('Variables', {})
variables['ENDPOINT_NAME'] = endpoint_name
resp = lambda_client.update_function_configuration(
    FunctionName=lambda_to_model_arn,
    Environment={
        'Variables': variables
    }
)

Updating Lambda to use endpoint: sagemaker-xgboost-2021-01-05-17-55-28-234 for ARN: arn:aws:lambda:us-east-1:082830052325:function:HeidelbergInvokeFraudEndpointLambda3







## DEPLOY LINKS 

### Lambda functions
1. InvokeFraudEndpoint (https://console.aws.amazon.com/lambda/home?region=us-east-1#/functions/HeidelbergInvokeFraudEndpointLambda3?tab=configuration)
1. StreamingInjectAggregateFeature( https://console.aws.amazon.com/lambda/home?region=us-east-1#/functions/HeidelbergStreamingIngestAggFeatures3?tab=configuration)
1. Lambdatoputincomingdata ( https://console.aws.amazon.com/lambda/home?region=us-east-1#/functions/lambda_mlops?tab=configuration)

### Template (CloudFormation Stack)

https://console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/template?filteringText=&filteringStatus=active&viewNested=true&hideStacks=false&stackId=arn%3Aaws%3Acloudformation%3Aus-east-1%3A082830052325%3Astack%2Fheidelberg-mlops-stack-3%2Ffc283870-49d8-11eb-b6cb-0ea6aa6eaed9

### LogStream (CloudWatch)

#### Endpoint log stream
https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logsV2:log-groups/log-group/$252Faws$252Flambda$252FHeidelbergInvokeFraudEndpointLambda3


#### Update FeatureStore Logstream
https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logsV2:log-groups/log-group/$252Faws$252Flambda$252FHeidelbergStreamingIngestAggFeatures3


#### Streaming lambda
https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logsV2:log-groups/log-group/$252Faws$252Flambda$252Flambda_mlops


### KDA Application
https://console.aws.amazon.com/kinesisanalytics/home?region=us-east-1#/wizard/hub?applicationName=heidelberg-mlops-app


### Real time lambda update s3 bucket
https://s3.console.aws.amazon.com/s3/buckets/lambda-trigger-mlops?region=us-east-1&tab=objects