## Environment Setup

Place your conda environment YAML in the section below.

In [None]:
# Conda environment definition for the XGBoost customer-churn training code in this notebook.
# NOTE(review): the imports cell also uses pandas, numpy, sagemaker and smexperiments, which are
# not listed explicitly here — confirm they are provided transitively or by the base environment.
name: xgb-customer-churn
# Channels conda searches for packages.
channels:
  - defaults
  - anaconda
  - conda-forge
dependencies:
  - python=3.6
  - xgboost
  # pip itself is installed via conda so the nested `pip:` section below can be processed.
  - pip
  # Packages installed with pip rather than conda.
  - pip:
      - mlflow>=1.6.0
      - matplotlib
      - boto3
      - smdebug

In [1]:
# Run parameters: MLflow tracking server address and XGBoost hyperparameters.
# NOTE(review): run_training() takes its hyperparameters from parse_args(), not from these
# names, and parse_args() defaults --num-round to 50 while num_round is 200 here — confirm
# which values are intended to drive training.
tracking_uri = "http://107.22.99.180/"
max_depth = 5
eta = 0.2
gamma = 4
min_child_weight = 6
subsample = 0.8
silent = 0
objective = "binary:logistic"
num_round = 200

## Place Imports 

Place all of the imports the notebook needs here.

In [1]:
import argparse
import io
import json
import logging
import os
import pickle
import re
import sys
import time
import warnings
from time import strftime, gmtime

import boto3
import matplotlib.pyplot as plt
import mlflow
import mlflow.xgboost
import numpy as np
import pandas as pd
import xgboost
from IPython.display import display

import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig
from sagemaker.model_monitor import DataCaptureConfig, DatasetFormat, DefaultModelMonitor
from sagemaker.s3 import S3Uploader, S3Downloader

from smdebug import SaveConfig
from smdebug.xgboost import Hook

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

## Utility Functions

Place the standalone functions in the section below.

In [39]:
def fetch_data(local_data_path):
    """Read the CSV file at ``local_data_path`` and return it as a DataFrame.

    As a side effect, pandas' global display options are changed so that up to
    500 columns and at most 10 rows are shown when a frame is displayed.
    """
    frame = pd.read_csv(local_data_path)
    display_options = (
        ('display.max_columns', 500),  # wide enough to show every column
        ('display.max_rows', 10),      # keep displayed output short
    )
    for option_name, option_value in display_options:
        pd.set_option(option_name, option_value)
    return frame

In [None]:
def create_smdebug_hook(out_dir, train_data=None, validation_data=None, frequency=1, collections=None):
    """Build the smdebug ``Hook`` that is passed to XGBoost as a training callback.

    Args:
        out_dir: forwarded to ``Hook`` as ``out_dir``.
        train_data: forwarded to ``Hook`` as ``train_data``.
        validation_data: forwarded to ``Hook`` as ``validation_data``.
        frequency: used as ``save_interval`` of the ``SaveConfig`` given to the hook.
        collections: forwarded to ``Hook`` as ``include_collections``.

    Returns:
        The constructed ``Hook`` instance.
    """
    return Hook(
        out_dir=out_dir,
        train_data=train_data,
        validation_data=validation_data,
        save_config=SaveConfig(save_interval=frequency),
        include_collections=collections,
    )

In [None]:
def model_fn(model_dir):
    """Load the model stored in ``model_dir``.

    For the XGBoost framework a default model-loading function is not provided,
    so the script supplies this one. The first regular file found in
    ``model_dir`` is tried as a pickle first and, if that fails, is loaded with
    ``xgboost.Booster.load_model``.

    Args:
        model_dir: a directory where the model is saved.

    Returns:
        A ``(booster, format)`` tuple: the loaded model and the string
        ``'pkl_format'`` or ``'xgb_format'`` naming how it was loaded.

    Raises:
        FileNotFoundError: if ``model_dir`` contains no regular file.
        SystemExit: with exit code -1 if the file cannot be loaded either way.
    """
    model_files = (
        entry for entry in os.listdir(model_dir)
        if os.path.isfile(os.path.join(model_dir, entry))
    )
    model_file = next(model_files, None)
    if model_file is None:
        # Fail with a descriptive error instead of a bare StopIteration.
        raise FileNotFoundError("No model file found in {}".format(model_dir))
    model_path = os.path.join(model_dir, model_file)

    try:
        # NOTE(review): pickle.load can execute arbitrary code contained in the
        # file; model_dir must only ever hold trusted artifacts.
        with open(model_path, 'rb') as model_handle:
            booster = pickle.load(model_handle)
        model_format = 'pkl_format'
    except Exception as exp_pkl:
        # Not a readable pickle: fall back to XGBoost's own model format.
        try:
            booster = xgboost.Booster()
            booster.load_model(model_path)
            model_format = 'xgb_format'
        except Exception as exp_xgb:
            print("Unable to load model: {} {}".format(str(exp_pkl), str(exp_xgb)))
            sys.exit(-1)
    booster.set_param('nthread', 1)
    return booster, model_format

In [None]:
def parse_args(argv=None):
    """Parse hyperparameters, smdebug settings and data/model locations.

    Args:
        argv: optional list of argument strings to parse. When ``None``
            (the default) the process command line (``sys.argv[1:]``) is used.

    Returns:
        An ``argparse.Namespace`` with the parsed values. ``train`` and
        ``validation`` default to the ``SM_CHANNEL_TRAIN`` and
        ``SM_CHANNEL_VALIDATION`` environment variables (``None`` when unset);
        ``model_dir`` defaults to ``SM_MODEL_DIR`` or ``/opt/ml/model``.

    Arguments that are not declared here are reported and ignored rather than
    aborting, so the function also works when ``sys.argv`` carries options
    meant for the host process (for example a notebook kernel's own flags).
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--max-depth", type=int, default=5)
    parser.add_argument("--eta", type=float, default=0.2)
    # gamma and min_child_weight are real-valued in XGBoost, so accept floats
    # (integer inputs such as "4" still parse, as 4.0).
    parser.add_argument("--gamma", type=float, default=4)
    parser.add_argument("--min-child-weight", type=float, default=6)
    parser.add_argument("--subsample", type=float, default=0.8)
    parser.add_argument("--silent", type=int, default=0)
    parser.add_argument("--objective", type=str, default="binary:logistic")
    parser.add_argument("--num-round", type=int, default=50)
    parser.add_argument("--smdebug-path", type=str, default=None)
    parser.add_argument("--smdebug-frequency", type=int, default=1)
    parser.add_argument("--smdebug-collections", type=str, default='metrics')
    parser.add_argument("--output-uri", type=str, default="/opt/ml/output/tensors",
                        help="S3 URI of the bucket where tensor data will be stored.")

    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--validation', type=str, default=os.environ.get('SM_CHANNEL_VALIDATION'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR', '/opt/ml/model'))

    # NOTE(review): this prints every environment variable, which may include
    # credentials or tokens — confirm that this output never reaches shared logs.
    print('\n-------------- Environment Variables -------------\n')
    for key, value in os.environ.items():
        print('{}={}'.format(key, value))

    args, unknown = parser.parse_known_args(argv)
    if unknown:
        print('Ignoring unrecognized arguments: {}'.format(unknown))

    return args

## Data Cleaning

In [93]:
def preprocessing(input_path):
    """Placeholder for the data-cleaning step.

    Currently ignores ``input_path``, performs no work and returns ``None``.
    """
    return None

## Visualize total data

Place all of the plots pertaining to the data in the following section.

In [None]:
# Render any pending figures; matplotlib.pyplot is imported as `plt` in the imports cell.
plt.show()

## Training Code

Place training code here

In [2]:
def run_training():
    """Train an XGBoost model inside an MLflow run and persist the result.

    Steps:
      1. Read settings with ``parse_args()``.
      2. Build train/validation ``DMatrix`` objects from the CSV locations
         (the URI suffix marks column 0 as the label).
      3. Train with an smdebug hook as callback while MLflow autologging is on.
      4. Pickle the model to ``<model_dir>/xgboost-model`` and log it to MLflow,
         registering it as ``xgb-customer-churn``.

    Raises:
        ValueError: if no training or validation location is available.
    """
    args = parse_args()

    train, validation = args.train, args.validation
    if train is None or validation is None:
        # Without this check the string concatenation below fails with an
        # uninformative "NoneType + str" TypeError.
        raise ValueError(
            "Training and validation data locations are required: pass --train/--validation "
            "or set SM_CHANNEL_TRAIN/SM_CHANNEL_VALIDATION."
        )

    # NOTE(review): the notebook defines `tracking_uri`, but nothing visible calls
    # mlflow.set_tracking_uri — confirm the tracking server is configured elsewhere.
    # Enable MLflow auto logging (a single call is sufficient).
    mlflow.xgboost.autolog()

    parse_csv = "?format=csv&label_column=0"
    dtrain = xgboost.DMatrix(train + parse_csv)
    dval = xgboost.DMatrix(validation + parse_csv)

    watchlist = [(dtrain, "train"), (dval, "validation")]
    with mlflow.start_run():
        params = {
            "max_depth": args.max_depth,
            "eta": args.eta,
            "gamma": args.gamma,
            "min_child_weight": args.min_child_weight,
            "subsample": args.subsample,
            "silent": args.silent,
            "objective": args.objective}

        # Location where the smdebug hook writes its output: --smdebug-path
        # takes precedence, otherwise --output-uri is used.
        output_uri = (
            args.smdebug_path
            if args.smdebug_path is not None
            else args.output_uri
        )

        collections = (
            args.smdebug_collections.split(',')
            if args.smdebug_collections is not None
            else None
        )

        hook = create_smdebug_hook(
            out_dir=output_uri,
            frequency=args.smdebug_frequency,
            collections=collections,
            train_data=dtrain,
            validation_data=dval,
        )

        model = xgboost.train(
            params=params,
            dtrain=dtrain,
            evals=watchlist,
            num_boost_round=args.num_round,
            callbacks=[hook])

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(args.model_dir, exist_ok=True)

        model_location = os.path.join(args.model_dir, 'xgboost-model')
        # Context manager guarantees the file is flushed and closed.
        with open(model_location, 'wb') as model_handle:
            pickle.dump(model, model_handle)

        mlflow.xgboost.log_model(model, 'model', registered_model_name='xgb-customer-churn')

## Run Training

In [None]:
# Module-level logger used to report a failed training run.
logger = logging.getLogger(__name__)

warnings.filterwarnings("ignore")
np.random.seed(40)
try:
    run_training()
except Exception:
    # logger.exception records the active traceback itself; passing the exception
    # as an extra positional argument without a placeholder breaks message formatting.
    logger.exception("Unable to execute training")

![CodePipeline](codepipeline.png "Sample CodePipeline View")