## Environment Setup

Place your conda yaml in the below section

In [None]:
name: xgb-customer-churn
channels:
  - defaults
  - anaconda
  - conda-forge
dependencies:
  - python=3.6
  - xgboost
  - pip
  - pip:
      - mlflow>=1.6.0
      - matplotlib
      - boto3
      - smdebug

In [1]:
tracking_uri = "http://107.22.99.180/"
max_depth = 5
eta = 0.2
gamma = 4
min_child_weight = 6
eta = 0.2
subsample = 0.8
silent = 0
objective = "binary:logistic"
num_round = 200

## Place Imports 

Place all the imports that you are aware of here.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import sys
import time
import json
from IPython.display import display
from time import strftime, gmtime
import boto3
import re


import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig
from sagemaker.model_monitor import DataCaptureConfig, DatasetFormat, DefaultModelMonitor
from sagemaker.s3 import S3Uploader, S3Downloader

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

## Utility Functions

Place the standalone functions in the below section

In [39]:
def fetch_data(local_data_path):
    data = pd.read_csv(local_data_path)
    pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
    pd.set_option('display.max_rows', 10)         # Keep the output on one page
    return data

In [None]:
def create_smdebug_hook(out_dir, train_data=None, validation_data=None, frequency=1, collections=None,):

    save_config = SaveConfig(save_interval=frequency)
    hook = Hook(
        out_dir=out_dir,
        train_data=train_data,
        validation_data=validation_data,
        save_config=save_config,
        include_collections=collections,
    )

    return hook

In [None]:
def model_fn(model_dir):
    """Load a model. For XGBoost Framework, a default function to load a model is not provided.
    Users should provide customized model_fn() in script.
    Args:
        model_dir: a directory where model is saved.
    Returns:
        A XGBoost model.
        XGBoost model format type.
    """
    model_files = (file for file in os.listdir(model_dir) if os.path.isfile(os.path.join(model_dir, file)))
    model_file = next(model_files)
    try:
        booster = pickle.load(open(os.path.join(model_dir, model_file), 'rb'))
        format = 'pkl_format'
    except Exception as exp_pkl:
        try:
            booster = xgboost.Booster()
            booster.load_model(os.path.join(model_dir, model_file))
            format = 'xgb_format'
        except Exception as exp_xgb:
            print("Unable to load model: {} {}".format(str(exp_pkl), str(exp_xgb)))
            import sys
            sys.exit(-1)
    booster.set_param('nthread', 1)
    return booster, format

In [None]:
def parse_args():

    parser = argparse.ArgumentParser()

    parser.add_argument("--max-depth", type=int, default=5)
    parser.add_argument("--eta", type=float, default=0.2)
    parser.add_argument("--gamma", type=int, default=4)
    parser.add_argument("--min-child-weight", type=int, default=6)
    parser.add_argument("--subsample", type=float, default=0.8)
    parser.add_argument("--silent", type=int, default=0)
    parser.add_argument("--objective", type=str, default="binary:logistic")
    parser.add_argument("--num-round", type=int, default=50)
    parser.add_argument("--smdebug-path", type=str, default=None)
    parser.add_argument("--smdebug-frequency", type=int, default=1)
    parser.add_argument("--smdebug-collections", type=str, default='metrics')
    parser.add_argument("--output-uri", type=str, default="/opt/ml/output/tensors",
                        help="S3 URI of the bucket where tensor data will be stored.")

    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--validation', type=str, default=os.environ.get('SM_CHANNEL_VALIDATION'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR', '/opt/ml/model'))

    print('\n-------------- Environment Variables -------------\n')
    for key, value in os.environ.items():
        print('{}={}'.format(key, value))

    args = parser.parse_args()

    return args

## Data Cleaning

In [93]:
def preprocessing(input_path):
    pass

## Visualize total data

Place all of the plots in the following section pertaining to the data

In [None]:
plot.show()

## Training Code

Place training code here

In [2]:
def run_training():
    args = parse_args()
    # enable auto logging
    mlflow.xgboost.autolog()
    train, validation = args.train, args.validation
    parse_csv = "?format=csv&label_column=0"
    dtrain = xgboost.DMatrix(train+parse_csv)
    dval = xgboost.DMatrix(validation+parse_csv)

    # enable auto logging
    mlflow.xgboost.autolog()

    watchlist = [(dtrain, "train"), (dval, "validation")]
    with mlflow.start_run():
        params = {
            "max_depth": args.max_depth,
            "eta": args.eta,
            "gamma": args.gamma,
            "min_child_weight": args.min_child_weight,
            "subsample": args.subsample,
            "silent": args.silent,
            "objective": args.objective}

        # The output_uri is a the URI for the s3 bucket where the metrics will be
        # saved.
        output_uri = (
            args.smdebug_path
            if args.smdebug_path is not None
            else args.output_uri
        )

        collections = (
            args.smdebug_collections.split(',')
            if args.smdebug_collections is not None
            else None
        )

        hook = create_smdebug_hook(
            out_dir=output_uri,
            frequency=args.smdebug_frequency,
            collections=collections,
            train_data=dtrain,
            validation_data=dval,
        )

        model = xgboost.train(
            params=params,
            dtrain=dtrain,
            evals=watchlist,
            num_boost_round=args.num_round,
            callbacks=[hook])

        if not os.path.exists(args.model_dir):
            os.makedirs(args.model_dir)

        model_location = os.path.join(args.model_dir, 'xgboost-model')
        pickle.dump(model, open(model_location, 'wb'))

        mlflow.xgboost.log_model(model, 'model', registered_model_name='xgb-customer-churn')

## Run Training

In [None]:
warnings.filterwarnings("ignore")
np.random.seed(40)
try:
    run_training()
except Exception as e:
    logger.exception("Unable to execute training", e)

In [2]:
%env AWS_DEFAULT_PROFILE=data-science
%env AWS_PROFILE=data-science
%env AWS_EB_PROFILE=data-science
%env AWS_ACCESS_KEY_ID=ASIA6KLLBVQ7OX2XX2MI
%env AWS_SECRET_ACCESS_KEY=NssYpDHwxL0O91UuYa02VTikjt+Kw5BvKJT+fMv6
%env AWS_SESSION_TOKEN=FwoGZXIvYXdzEIv//////////wEaDKLNSR3ud7tt4oLlhSKsAfntfEGxMzCn7T6uPxv9MaoCjfHkl2F46pzsETBw0iSLhhB8URFDpv0hh/PxOqqNQBIESyPZ/Yy79OLxTWa5E52zC+IjVNlGDqUCa+4f0N+wP+eNOpZs0aQT0nd965vWmxm5F81X0/YG5hK/5Kn9ywLzwlX68SZr3WCmRPErnlxDN9PN/tnStvlear2xEl7JAODLNJy9ej4PjLIrLdO2CI/PaXfmxruSvxGS/80olNKP+QUyLcRZ+UNwxa+dZn4HelKrPcEDg/CcWgzn442ohrTcS0MwpQJ7w8BchdHTPfGrHQ==

env: AWS_DEFAULT_PROFILE=data-science
env: AWS_PROFILE=data-science
env: AWS_EB_PROFILE=data-science
env: AWS_ACCESS_KEY_ID=ASIA6KLLBVQ7OX2XX2MI
env: AWS_SECRET_ACCESS_KEY=NssYpDHwxL0O91UuYa02VTikjt+Kw5BvKJT+fMv6
env: AWS_SESSION_TOKEN=FwoGZXIvYXdzEIv//////////wEaDKLNSR3ud7tt4oLlhSKsAfntfEGxMzCn7T6uPxv9MaoCjfHkl2F46pzsETBw0iSLhhB8URFDpv0hh/PxOqqNQBIESyPZ/Yy79OLxTWa5E52zC+IjVNlGDqUCa+4f0N+wP+eNOpZs0aQT0nd965vWmxm5F81X0/YG5hK/5Kn9ywLzwlX68SZr3WCmRPErnlxDN9PN/tnStvlear2xEl7JAODLNJy9ej4PjLIrLdO2CI/PaXfmxruSvxGS/80olNKP+QUyLcRZ+UNwxa+dZn4HelKrPcEDg/CcWgzn442ohrTcS0MwpQJ7w8BchdHTPfGrHQ==


In [29]:
import sagemaker
import boto3

from sagemaker.predictor import csv_serializer, RealTimePredictor
from sagemaker.content_types import CONTENT_TYPE_CSV

import sagemaker
from sagemaker import get_execution_role
sess = sagemaker.Session()
role = get_execution_role()
endpoint_name = 'xgb-customer-churn-v3'


predictor = RealTimePredictor(endpoint=endpoint_name, sagemaker_session=sess, serializer=csv_serializer, deserializer=None,
                                content_type=CONTENT_TYPE_CSV)

predictor.accept = 'text/csv'
with open('test-data/test_sample.csv', 'r') as f:
    for row in f:
        payload = row.strip('\n')
        response = predictor.predict(data=payload)
        time.sleep(0.5)

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from mfs-xgb-customer-churn-v3-model-mpydfwhfrcau9ede9gvbtg with message "{"error_code": "BAD_REQUEST", "message": "Encountered an unexpected error while evaluating the model. Verify that the serialized input Dataframe is compatible with the model for inference.", "stack_trace": "Traceback (most recent call last):\n  File \"/miniconda/envs/custom_env/lib/python3.6/site-packages/mlflow/pyfunc/scoring_server/__init__.py\", line 196, in transformation\n    raw_predictions = model.predict(data)\n  File \"/miniconda/envs/custom_env/lib/python3.6/site-packages/mlflow/xgboost.py\", line 186, in predict\n    return self.xgb_model.predict(xgb.DMatrix(dataframe))\n  File \"/miniconda/envs/custom_env/lib/python3.6/site-packages/xgboost/core.py\", line 520, in __init__\n    data, feature_names, feature_types\n  File \"/miniconda/envs/custom_env/lib/python3.6/site-packages/xgboost/core.py\", line 420, in _convert_dataframes\n    meta_type)\n  File \"/miniconda/envs/custom_env/lib/python3.6/site-packages/xgboost/core.py\", line 294, in _maybe_pandas_data\n    raise ValueError(msg + ', '.join(bad_fields))\nValueError: DataFrame.dtypes for data must be int, float or bool.\n                Did not expect the data types in fields 132, 25, 113.2, 96, 269.9, 107, 229.1, 87, 7.1, 7, 2, 0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 1, 0.19, 0.20, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.40, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.50, 1.1, 0.51, 1.2, 0.52, 0.53, 1.3\n"}". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/xgb-customer-churn-v3 in account 984308886590 for more information.

In [25]:
from io import StringIO
import pandas as pd
csv_input = StringIO(payload)
data = pd.read_csv(csv_input)
import xgboost as xgb
xgb.DMatrix(data)

In [31]:
import xgboost as xgb
xgb.DMatrix(dataframe)

XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed (vcomp140.dll or libgomp-1.dll for Windows, libomp.dylib for Mac OSX, libgomp.so for Linux and other UNIX-like OSes). Mac OSX users: Run `brew install libomp` to install OpenMP runtime.
  * You are running 32-bit Python on a 64-bit OS
Error message(s): ['dlopen(/Users/jerry/.virtualenvs/jupyter/lib/python3.7/site-packages/xgboost/lib/libxgboost.dylib, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /Users/jerry/.virtualenvs/jupyter/lib/python3.7/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: image not found']


In [18]:
!head 'test-data/test_sample.csv'

186,0.1,137.8,97,187.7,118,146.4,85,8.7,6,1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.10,0.11,0.12,0.13,0.14,0.15,0.16,0.17,1.1,0.18,0.19,0.20,0.21,0.22,0.23,0.24,0.25,0.26,0.27,0.28,0.29,0.30,0.31,0.32,0.33,0.34,0.35,0.36,0.37,0.38,0.39,0.40,0.41,0.42,0.43,0.44,0.45,0.46,0.47,0.48,0.49,0.50,0.51,0.52,0.53,1.2,1.3,0.54,1.4,0.55
132,25,113.2,96,269.9,107,229.1,87,7.1,7,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1
112,17,183.2,95,252.8,125,156.7,95,9.7,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1
91,24,93.5,112,183.4,128,240.7,133,9.9,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1
22,0,110.3,107,166.5,93,202.3,96,9.5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0
102,0,186.8,92,173.7,123,250.9,131,9.7,

![CodePipeline](codepipeline.png "Sample CodePipeline View")