# Sample of SageMaker-byoc-tf2-byos-deepctr-deepfm-on-SageMakerStudio-notebook

NOTE: Run this notebook in SageMaker Studio, not on a SageMaker notebook instance.

Core requirements:
1. TensorFlow2.8+py3.8
2. BYOC
3. SageMaker+DeepCTR-DeepFM
4. BYOS in S3 for AirFlow usage
5. Use FastFile mode to access dataset files in S3

Steps:
1. Generate Dockerfile
2. Generate requirements.txt
3. Generate train_fastfile.py
4. Build docker image & upload to ECR
5. SageMaker setting
6. Package train_fastfile.py and requirements.txt, then upload to S3
7. [Not Supported] Local test image
8. [Not Needed] Upload container image to Amazon ECR
9. Start SageMaker training job
10. [TBD] Build a SageMaker pipeline

## 1. Generate Dockerfile

In [2]:
%%writefile Dockerfile
# Base image: official TensorFlow 2.8.4 image.
FROM tensorflow/tensorflow:2.8.4

# Install sagemaker-training toolkit that contains the common functionality necessary to create a container compatible with SageMaker and the Python SDK.
# pip is upgraded first; scikit-learn and pandas are installed because the training script imports them.
RUN /usr/bin/python3 -m pip install --upgrade pip
RUN pip3 install sagemaker-training && pip3 install scikit-learn && pip3 install pandas
#RUN pip3 install -U scikit-learn

# Copies the training code inside the container
# (disabled: this notebook passes the script to the Estimator via entry_point/source_dir instead of baking it into the image)
#COPY train.py /opt/ml/code/train.py

# Defines train.py as script entrypoint
# (disabled for the same reason as the COPY above)
#ENV SAGEMAKER_PROGRAM train.py
WORKDIR /opt/ml/code

Overwriting Dockerfile


## 2. Generate requirements.txt

In [3]:
%%writefile requirements.txt
# Pinned to the release that the recorded training run installed, so reruns resolve the same package.
deepctr==0.9.3

Overwriting requirements.txt


## 3. Generate train_fastfile.py

In [4]:
%%writefile train_fastfile.py
import pandas as pd
import tensorflow as tf
import os
import argparse
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr.models import *
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

if __name__ == "__main__":
    # Entry point of the training script: load the Criteo sample, train a DeepFM
    # model, save it, and print LogLoss / AUC on a held-out split.
    parser = argparse.ArgumentParser()

    # Hyperparameters that are not wired up yet (kept for reference):
    # parser.add_argument("--learning-rate", type=float, default=0.01)
    # parser.add_argument("--batch-size", type=int, default=128)
    # parser.add_argument("--batch-norm", type=bool, default=False)
    # parser.add_argument("--dnn-hidden-units", type=str, default="128,64,32")
    # parser.add_argument("--dropout-rate", type=float, default=0.0)
    # parser.add_argument("--checkpoint", type=str, default=None)

    # Input/output locations. The environment variables are read when present;
    # the fallbacks are the paths this script previously hard-coded, so behavior
    # is unchanged when the variables are missing.
    parser.add_argument('--model_dir', type=str,
                        default=os.environ.get('SM_MODEL_DIR', '/opt/ml/model'))
    parser.add_argument('--train', type=str,
                        default=os.environ.get('SM_CHANNEL_TRAIN', '/opt/ml/input/data/train'))

    # parse_known_args: ignore any extra arguments the launcher may pass.
    args, _ = parser.parse_known_args()

    # Read the dataset from the "train" channel directory instead of a fixed path.
    data = pd.read_csv(os.path.join(args.train, 'criteo_sample.txt'))

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name
    # vocabulary_size is max encoded id + 1 because LabelEncoder ids start at 0.
    sparse_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=4)
                      for feat in sparse_features]
    dense_columns = [DenseFeat(feat, 1, ) for feat in dense_features]
    fixlen_feature_columns = sparse_columns + dense_columns

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model
    train, test = train_test_split(data, test_size=0.2, random_state=2020)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4.Define Model,train,predict and evaluate
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    model.compile("adam", "binary_crossentropy",
                  metrics=['binary_crossentropy'], )

    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
    model.summary()

    # Save under the configured model directory rather than a fixed path.
    model.save(os.path.join(args.model_dir, 'deepctr-deepfm'))

    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))


Overwriting train_fastfile.py


## 4. Build docker image

In [5]:
%%time
# Earlier local `docker build` approach, kept for reference only:
#algorithm_name='byoc1'
#docker build -t ${algorithm_name} .
# Install the package that provides the `sm-docker` command used by the next cell.
!pip install sagemaker-studio-image-build



[0mCPU times: user 56.3 ms, sys: 9.28 ms, total: 65.6 ms
Wall time: 3.47 s


In [6]:
%%time
# Build the image from the Dockerfile in the current directory and publish it as
# repository `byoc1`, tag `latest`. The recorded log below shows the build running in CodeBuild.
!sm-docker build . --repository byoc1:latest

...[Container] 2023/01/17 04:38:57 going inside waitForAgent

[Container] 2023/01/17 04:38:57 Waiting for agent ping
[Container] 2023/01/17 04:38:58 Waiting for DOWNLOAD_SOURCE
[Container] 2023/01/17 04:39:01 Phase is DOWNLOAD_SOURCE
[Container] 2023/01/17 04:39:01 finished waitForAgent
[Container] 2023/01/17 04:39:01 CODEBUILD_SRC_DIR=/codebuild/output/src225897697/src
[Container] 2023/01/17 04:39:01 YAML location is /codebuild/output/src225897697/src/buildspec.yml
[Container] 2023/01/17 04:39:01 Setting HTTP client timeout to higher timeout for S3 source
[Container] 2023/01/17 04:39:01 Processing environment variables
[Container] 2023/01/17 04:39:01 No runtime version selected in buildspec.
[Container] 2023/01/17 04:39:01 Moving to directory /codebuild/output/src225897697/src
[Container] 2023/01/17 04:39:01 Configuring ssm agent with target id: codebuild:26e3a881-a5e4-46da-95e0-777a69b2c371
[Container] 2023/01/17 04:39:01 Successfully updated ssm agent configuration
[Container] 2023/

## 5. SageMaker setting

In [7]:
%%time
#! python3 -m pip install --upgrade sagemaker
import sagemaker
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator
import boto3

sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
prefix = "byoc1"

role = (
    get_execution_role()
)  # provide a pre-existing role ARN as an alternative to creating a new role
role_name = role.split(["/"][-1])
print(f"SageMaker Execution Role: {role}")
print(f"The name of the Execution role: {role_name[-1]}")

client = boto3.client("sts")
account = client.get_caller_identity()["Account"]
print(f"AWS account: {account}")

session = boto3.session.Session()
region = session.region_name
print(f"AWS region: {region}")

SageMaker Execution Role: arn:aws:iam::432088571089:role/AmazonSageMaker-ExecutionRole-20210324T123126
The name of the Execution role: AmazonSageMaker-ExecutionRole-20210324T123126
AWS account: 432088571089
AWS region: us-east-1
CPU times: user 1.03 s, sys: 231 ms, total: 1.26 s
Wall time: 1.44 s


## 6. Package train_fastfile.py and requirements.txt, then upload to S3

In [8]:
!tar zcvf train.tar.gz train_fastfile.py train_tf_data.py requirements.txt
source_dir_s3=sagemaker_session.upload_data(path='train.tar.gz', bucket=bucket, key_prefix=prefix)
print(source_dir_s3)

train_fastfile.py
tar: train_tf_data.py: Cannot stat: No such file or directory
requirements.txt
tar: Exiting with failure status due to previous errors
s3://sagemaker-us-east-1-432088571089/byoc1/train.tar.gz


## 7. [Not Supported] Local test image

The dataset file can be downloaded from
https://github.com/shenweichen/DeepCTR/blob/master/examples/criteo_sample.txt. Upload it to the appropriate directory of your S3 bucket (the training cell below reads from `datasets/deepctr/` in the default bucket).

In [9]:
# Disabled: local-mode test of the image (instance_type='local'). The section
# heading marks this step as not supported; the code is kept for reference only.
# from sagemaker.estimator import Estimator
# from sagemaker.inputs import TrainingInput
# import time

# algorithm_name='byoc1'
# dataset_path="datasets/deepctr/"

# train_channel=TrainingInput(
#         s3_data=f's3://{bucket}/{dataset_path}',
#         input_mode='FastFile'  # Available options: File | Pipe | FastFile
#     )

# estimator = Estimator(image_uri=f'{algorithm_name}:latest',
#                       role=role,
#                       entry_point='train_fastfile.py',
#                       source_dir=source_dir_s3,#'.',
#                       instance_count=1,
#                       instance_type='local')

# estimator.fit(inputs={"train":train_channel},
#               job_name="hstong-"+time.strftime("%Y%m%d%H%M%S", time.localtime()))

## 8. [Not Needed] Upload container image to Amazon ECR

In [10]:
# %%sh
# Disabled: manual docker build + push to ECR. The section heading marks this
# step as not needed (presumably because the `sm-docker build` step already
# publishes the image — confirm before re-enabling).

# # Specify an algorithm name
# algorithm_name='byoc1'

# account=$(aws sts get-caller-identity --query Account --output text)

# # Get the region defined in the current configuration (default to us-west-2 if none defined)
# region=$(aws configure get region)
# region=${region:-us-west-2}

# fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"

# # If the repository doesn't exist in ECR, create it.

# aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1
# if [ $? -ne 0 ]
# then
# aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
# fi

# # Get the login command from ECR and execute it directly

# aws ecr get-login-password --region ${region}|docker login --username AWS --password-stdin ${fullname}

# # Build the docker image locally with the image name and then push it to ECR
# # with the full name.

# docker build -t ${algorithm_name} .
# docker tag ${algorithm_name} ${fullname}

# docker push ${fullname}



## 9. Start SageMaker training job

In [14]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
import time

# Build the ECR image URI from the account and region detected in the
# "SageMaker setting" cell instead of hard-coding them, so the notebook runs
# unchanged in another account or region.
# NOTE(review): the ".amazonaws.com" suffix is assumed; override byoc_image_uri
# by hand if your registry uses a different domain.
algorithm_name='byoc1'
byoc_image_uri=f'{account}.dkr.ecr.{region}.amazonaws.com/{algorithm_name}:latest'
dataset_path="datasets/deepctr/"

# "train" channel: every object under the dataset prefix of the default bucket.
train_channel=TrainingInput(
        s3_data=f's3://{bucket}/{dataset_path}',
        input_mode='FastFile'  # Available options: File | Pipe | FastFile
    )

# The training script is not baked into the image: source_dir points at the
# uploaded tarball and entry_point names the script inside it.
estimator = Estimator(image_uri=byoc_image_uri,
                      role=role,
                      entry_point='train_fastfile.py',
                      source_dir=source_dir_s3,#'.',
                      instance_count=1,
                      instance_type='ml.c5.xlarge')

# A timestamp suffix gives each run a distinct job name.
estimator.fit(inputs={"train":train_channel},
              job_name="hstong-"+time.strftime("%Y%m%d%H%M%S", time.localtime()))

2023-01-17 04:50:10 Starting - Starting the training job...
2023-01-17 04:50:38 Starting - Preparing the instances for trainingProfilerReport-1673931010: InProgress
......
2023-01-17 04:51:41 Downloading - Downloading input data
2023-01-17 04:51:41 Training - Training image download completed. Training in progress...[34m2023-01-17 04:51:46,567 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/usr/bin/python3 -m pip install -r requirements.txt[0m
[34mCollecting deepctr
  Downloading deepctr-0.9.3-py3-none-any.whl (141 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 141.2/141.2 kB 21.5 MB/s eta 0:00:00[0m
[34mCollecting h5py==2.10.0
  Downloading h5py-2.10.0-cp38-cp38-manylinux1_x86_64.whl (2.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.9/2.9 MB 108.3 MB/s eta 0:00:00[0m
[34mInstalling collected packages: h5py, deepctr
  Attempting uninstall: h5py
    Found existing installation: h5py 3.7.0
    Uninstalling h5py-3.7.0:
      Succ