## Create an endpoint from the project

In [15]:
import datetime
import time
import tarfile
import os
import pickle

import boto3
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput

import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer

from sklearn.base import TransformerMixin

# Low-level SageMaker API client (control-plane calls).
sm_boto3 = boto3.client('sagemaker')

# High-level SageMaker session; the region and default bucket are derived from it.
sess = sagemaker.Session()
region = sess.boto_session.region_name

# Default bucket of the session; a hard-coded bucket name would work here as well.
bucket = sess.default_bucket()

print(f'Using bucket {bucket}')

Using bucket sagemaker-ap-northeast-2-806174985048


In [17]:
!mkdir data

In [18]:
%%bash
# Abort on the first failing command so a failed download is reported
# instead of being followed by a confusing `mv` error.
set -euo pipefail

# Do not rely on an earlier cell having created the target directory.
mkdir -p data

# -O pins the output name so a leftover adult.data from a previous run is
# overwritten rather than causing wget to write adult.data.1.
wget -q -O adult.data https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
mv adult.data ./data/adult.csv

In [19]:
# Read the raw CSV; column names are supplied explicitly via `names`.
column_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
df = pd.read_csv('data/adult.csv', names=column_names)

# Turn the '?' missing-value marker into NaN so the imputers can handle it.
# The UCI file separates fields with ", ", so the marker is read as ' ?'
# (leading space); an exact '?' comparison never matches. Match it regardless
# of surrounding whitespace. All other values are left untouched.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = df.replace(r'^\s*\?\s*$', np.nan, regex=True)

# Hold out 10% as the test set; the seed is fixed for reproducibility.
df_train_val, df_test = train_test_split(df, test_size=0.1, random_state=42)

# Feature frame without the 'income' target column.
df_train_val_no_target = df_train_val.drop('income', axis=1)

# Persist the held-out rows (target column included) for later use.
df_test.to_csv('data/test.csv', index=False)

In [21]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Positional indices of the numeric and the categorical (object-dtype) feature columns.
numeric_ind = [i for i, x in enumerate(df_train_val_no_target.dtypes) if x != object]
cat_ind = [i for i, x in enumerate(df_train_val_no_target.dtypes) if x == object]

# Numeric columns: fill missing values with the column mean.
numeric_transformer = SimpleImputer(strategy='mean')

# Categorical columns: fill missing values with the mode, then one-hot encode.
# handle_unknown='ignore' lets categories unseen during fit pass through at transform time.
# NOTE(review): `sparse` was renamed `sparse_output` in newer scikit-learn releases —
# switch the keyword when the environment is upgraded.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_ind),
    ('cat', categorical_transformer, cat_ind)
])

X = preprocessor.fit_transform(df_train_val_no_target)

# Encode the income labels as integers and prepend them as column 0 of the matrix.
y = LabelEncoder().fit_transform(df_train_val.income)
X = np.insert(X, 0, y, axis=1)

# Save the fitted ColumnTransformer to be used during inference.
# Create the target directory first so the cell does not fail with
# FileNotFoundError when 'src' does not exist yet.
os.makedirs('src', exist_ok=True)
with open('src/preprocess.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

In [23]:
# Hold out 20% of the labelled matrix for validation; the seed is fixed for reproducibility.
X_train, X_val = train_test_split(X, test_size=0.2, random_state=42)

# Write both splits as header-less CSV files.
np.savetxt("data/train.csv", X_train, delimiter=",", fmt='%f')
np.savetxt("data/val.csv", X_val, delimiter=",", fmt='%f')

prefix = 'sagemaker/blog'

# S3 keys always use '/', so build them explicitly instead of with os.path.join,
# which would insert the host OS path separator. The bucket resource is created
# once and reused for both uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object('{}/train/train.csv'.format(prefix)).upload_file('data/train.csv')
s3_bucket.Object('{}/validation/validation.csv'.format(prefix)).upload_file('data/val.csv')

In [24]:
# Image URI of the built-in XGBoost container (version 1.2-1) for the current region.
container = sagemaker.image_uris.retrieve('xgboost', boto3.Session().region_name, '1.2-1')

# Hyperparameters are passed to the training job as strings.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",
    num_round="50",
)

# Training and validation channels, pointing at the S3 prefixes holding the CSV files.
s3_input_train = TrainingInput(
    s3_data=f's3://{bucket}/{prefix}/train',
    content_type='csv',
)
s3_input_validation = TrainingInput(
    s3_data=f's3://{bucket}/{prefix}/validation/',
    content_type='csv',
)

# Single-instance estimator; the model artifact is written under <prefix>/output.
xgb = sagemaker.estimator.Estimator(
    container,
    get_execution_role(),
    hyperparameters=hyperparameters,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=sess,
)

# Launch the training job with both channels.
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

2022-10-16 23:05:21 Starting - Starting the training job...
2022-10-16 23:05:45 Starting - Preparing the instances for trainingProfilerReport-1665961521: InProgress
.........
2022-10-16 23:07:10 Downloading - Downloading input data......
2022-10-16 23:08:06 Training - Downloading the training image......
2022-10-16 23:09:06 Training - Training image download completed. Training in progress.[34m[2022-10-16 23:09:10.557 ip-10-0-174-162.ap-northeast-2.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0

In [26]:
!mkdir model

mkdir: cannot create directory ‘model’: File exists


In [27]:
# Copy the trained model artifact from its S3 location (xgb.model_data) into the
# local model/ directory so it can be used during inference
!aws s3 cp {xgb.model_data} model/

download: s3://sagemaker-ap-northeast-2-806174985048/sagemaker/blog/output/sagemaker-xgboost-2022-10-16-23-05-21-303/output/model.tar.gz to model/model.tar.gz
