# Training

## Hyperparameters

In [2]:
# hyper-parameters

import os
import datetime
import sagemaker
from sagemaker.utils import sagemaker_timestamp
from sagemaker.tensorflow import TensorFlow
from sagemaker import get_execution_role


# SageMaker session handle; it is not referenced again in the cells visible here.
sess = sagemaker.Session()
# Execution role returned by the SageMaker SDK; stored on config.SM_IAM_ROLE
# and handed to the estimator as its `role`.
role = get_execution_role()

# Both strings are interpolated into `model_uri` in the training cell
# ("date=<pretrain_date_str>" and "<pretrain_job_name>"); they are empty here.
pretrain_date_str = ""
pretrain_job_name = ""
# Sub-folder appended to the bucket/prefix to build `train_path`, and (with an
# "output_" prefix) `output_path`.
train_date_str = "training_dataset/"
# Not referenced by any cell visible here; the test cell reads a hard-coded
# local path instead.
test_date_str = "test_dataset/"

class config:
    """Namespace of constants shared by the training and testing cells.

    The class is never instantiated; cells read the attributes directly
    (e.g. ``config.AI_EPOCHS``).
    """

    # ---- Common parameters -------------------------------------------------
    # Tags attached to the training job.
    TAGS = [{'Key': 'Service', 'Value': 'SageMaker'}]
    # Bucket and key prefix under which training data, uploaded code and
    # model output paths are built.
    BUCKET_NAME = 'bucket_name'
    PREFIX_PATH = 'prefix_path'

    # ---- Training parameters -----------------------------------------------
    # Used as the estimator's base job name and also forwarded to the
    # training script as the 'base_job_name' hyperparameter.
    TRAIN_JOB_NAME = 'YOYI-training-lixu'

    # Optimisation settings forwarded to the training script as hyperparameters.
    AI_EPOCHS = 10
    AI_BATCH_SIZE = 8192
    AI_LR = 0.001
    AI_L2_REG = 0

    # Instance type/count for the training job. The training cell treats the
    # value "local" specially (no spot instances, fully replicated input).
    SM_TRAIN_EC2_TYPE = 'ml.m5.2xlarge'
    SM_TRAIN_EC2_NUM = 20
    # Forwarded to the training script as the 'process_per_host' hyperparameter.
    PROCESSES_PER_HOST = 8

    # Local directory passed to the estimator as `source_dir`.
    TRAIN_CODE_PATH = 'code'
    SM_IAM_ROLE = role
    # Passed to the estimator as `train_volume_size` (presumably GB - confirm).
    TRAIN_VOL_SIZE = 8

    # Dataset sizes forwarded to the training script as hyperparameters.
    TRAIN_DATA_SIZE = 363447989
    VALIDATION_DATA_SIZE = 2 * AI_BATCH_SIZE

    # Price-bucket range: start, limit and bucket width. get_model derives the
    # number of output buckets from these three values.
    ALGO_B_START = 0
    ALGO_B_LIMIT = 120
    ALGO_B_DELTA = 1

## Training

In [3]:
### Training ###

import os
import logging
import sagemaker
from sagemaker.utils import sagemaker_timestamp
from sagemaker.tensorflow import TensorFlow
from sagemaker import get_execution_role


# Hyperparameters forwarded to the training entry point. Every value comes
# from `config` so the job is fully described by the configuration cell.
hyperparameters = {
    'epochs': config.AI_EPOCHS,
    'batch_size': config.AI_BATCH_SIZE,
    'learning_rate': config.AI_LR,
    'l2_reg': config.AI_L2_REG,
    'process_per_host': config.PROCESSES_PER_HOST,
    'training_data_size': config.TRAIN_DATA_SIZE,
    'validation_data_size': config.VALIDATION_DATA_SIZE,
    'b_start': config.ALGO_B_START,
    'b_limit': config.ALGO_B_LIMIT,
    'b_delta': config.ALGO_B_DELTA,
    'base_job_name': config.TRAIN_JOB_NAME
}

# Regular expressions used to extract metrics from the training logs.
# Raw strings keep `\d` and `\.` as regex escapes instead of (invalid) Python
# string escapes; the resulting pattern text is unchanged.
# NOTE(review): the 'loss' pattern is unanchored, so it also matches the tail
# of a "val_loss: ..." log entry - confirm this is the intended behaviour.
metric_definitions = [
    {'Name': 'loss', 'Regex': r'loss: (\d+\.\d+)'},
    {'Name': 'val_loss', 'Regex': r'val_loss: (\d+\.\d+)'}
]

# S3 URIs always use "/" as separator, so build them with posixpath rather
# than os.path (which would insert "\" on Windows). On POSIX systems
# os.path.join *is* posixpath.join, so the resulting strings are unchanged.
import posixpath

# Location of the training data.
train_path = posixpath.join(
    "s3://",
    config.BUCKET_NAME,
    config.PREFIX_PATH,
    train_date_str
)

# Where the SageMaker SDK uploads the packaged training code.
code_location = posixpath.join(
    "s3://",
    config.BUCKET_NAME,
    config.PREFIX_PATH,
    "custom_code_upload"
)

# Artifact of a previous training job, identified by pretrain_date_str and
# pretrain_job_name. It is not passed to the estimator in this cell.
model_uri = posixpath.join(
    "s3://",
    config.BUCKET_NAME,
    config.PREFIX_PATH,
    "model",
    "date={}".format(pretrain_date_str),
    "{}".format(pretrain_job_name),
    "output",
    "model.tar.gz"
)

# Destination for the artifacts produced by this training job.
output_path = posixpath.join(
    "s3://",
    config.BUCKET_NAME,
    config.PREFIX_PATH,
    "model",
    "output_" + train_date_str
)

# "local" as instance type switches off the cloud-only options below.
_is_local_mode = (config.SM_TRAIN_EC2_TYPE == "local")
# Used for both the maximum run time and the maximum wait time, in seconds.
_six_hours_in_seconds = 6 * 3600

# Remote jobs shard the S3 objects across instances; local mode replicates
# the full input.
train_input = sagemaker.session.s3_input(
    s3_data=train_path,
    distribution="FullyReplicated" if _is_local_mode else "ShardedByS3Key"
)

# Channel name -> input definition, as handed to `fit`.
inputs = dict(train=train_input)

estimator = TensorFlow(
    # --- code ---
    entry_point="training-textline_rag_hash_emb.py",
    source_dir=config.TRAIN_CODE_PATH,
    code_location=code_location,
    script_mode=True,
    framework_version='2.4.1',
    py_version='py37',
    # --- infrastructure ---
    role=config.SM_IAM_ROLE,
    train_instance_count=config.SM_TRAIN_EC2_NUM,
    train_instance_type=config.SM_TRAIN_EC2_TYPE,
    train_volume_size=config.TRAIN_VOL_SIZE,
    distributions={'mpi': {'enabled': True}},
    train_use_spot_instances=not _is_local_mode,
    train_max_run=_six_hours_in_seconds,
    train_max_wait=_six_hours_in_seconds,
    # --- job definition ---
    base_job_name=config.TRAIN_JOB_NAME,
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    tags=config.TAGS,
    output_path=output_path,
    container_log_level=logging.ERROR
)

# Launch the training job on the 'train' channel.
estimator.fit(inputs)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
Parameter distribution will be renamed to {'mpi': {'enabled': True}} in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2021-08-11 13:46:29 Starting - Starting the training job...
2021-08-11 13:46:31 Starting - Launching requested ML instances......
2021-08-11 13:47:46 Starting - Preparing the instances for training............
2021-08-11 13:49:40 Downloading - Downloading input data.........
2021-08-11 13:51:20 Training - Downloading the training image..[32m2021-08-11 13:51:37.534560: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[32m2021-08-11 13:51:37.541995: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[32m2021-08-11 13:51:37.721030: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[32m2021-08-11 13:51:36.324866: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[32m2021-08-11 13:51:3

In [5]:
# Fetch the trained model artifact and unpack it into ./model.
#
# Commands are passed as argument lists (no shell string concatenation) and
# check_call raises CalledProcessError on a non-zero exit status, so a failed
# download or extraction stops the notebook instead of being silently ignored.
import shutil
import subprocess

# Start from an empty directory; a missing directory is not an error.
shutil.rmtree('model', ignore_errors=True)
os.makedirs('model', exist_ok=True)

subprocess.check_call(['aws', 's3', 'cp', estimator.model_data, './model/model.tar.gz'])
subprocess.check_call(['tar', '-zxvf', './model/model.tar.gz', '-C', './model'])

0

# Testing

## load model - RaggedTensor

In [8]:
# [Optional] - LOAD MODEL - RaggedTensor


from __future__ import print_function
import argparse
import os
import sys
import math
import logging
import numpy as np
import codecs
import json
import glob
import pandas as pd
from collections import defaultdict
from io import StringIO
from operator import add
import boto3
from subprocess import check_output
from datetime import datetime
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.python.framework import ops
from tensorflow.python.ops import math_ops
from tensorflow.keras import Model, Sequential
from tensorflow.keras import Input
from tensorflow.keras import layers
from tensorflow.keras.layers import DenseFeatures
from tensorflow.keras.layers import Reshape, Flatten, Concatenate, RepeatVector, Add, Subtract, Multiply, Dot, PReLU, Softmax, Activation
from tensorflow.keras.layers import Dense, Lambda, Embedding, LocallyConnected1D, Permute, Dropout
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPool1D, LSTM, GRU
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K
from tensorflow.keras.losses import MeanAbsoluteError, MeanSquaredError
from tensorflow.keras.losses import MeanSquaredLogarithmicError, MeanAbsolutePercentageError, BinaryCrossentropy
from tensorflow.keras.metrics import MAE, MSE, MSLE, MAPE
import horovod.tensorflow.keras as hvd


# Price-bucket range (start, limit, bucket width) copied from the configuration.
B_START, B_LIMIT, B_DELTA = (
    config.ALGO_B_START,
    config.ALGO_B_LIMIT,
    config.ALGO_B_DELTA,
)

def neighbourhood_likelihood_loss(y_true, y_pred):
    """Placeholder loss used when compiling the model in this notebook.

    Both arguments are ignored and a constant ``0.0`` is returned.
    """
    placeholder_loss = 0.0
    return placeholder_loss

def get_model(learning_rate, l2_reg, b_start, b_limit, b_delta):
    """Build and compile the price-distribution model.

    The network embeds a ragged batch of int64 feature ids, averages the
    embeddings per example, passes them through three small ReLU dense
    layers and outputs a softmax over price buckets.

    Args:
        learning_rate: learning rate given to the Adam optimizer.
        l2_reg: L2 regularisation factor applied to every Dense kernel.
        b_start: lower bound of the price range.
        b_limit: upper bound of the price range.
        b_delta: width of one price bucket.

    Returns:
        The compiled ``tf.keras.Model``. Its summary is printed as a side effect.
    """
    # Number of softmax outputs; epsilon guards against the quotient landing
    # just below an integer because of floating-point rounding.
    price_bucket_num = int(math.floor((b_limit - b_start + K.epsilon()) / b_delta))

    ### Input Layers ###
    input_tensor = Input(shape=(None,), ragged=True, dtype='int64', name='features')

    ### Embedding Layers ###
    input_embed_tensor = Embedding(1024, 10, name='x_Embedding')(input_tensor)
    # Average the embeddings along axis 1 to get one vector per example.
    input_embed_tensor = Lambda(lambda x: tf.math.reduce_mean(x, axis=1))(input_embed_tensor)

    ### 1-Order Feature Extractor ###
    x_o1_tensor = Dense(10, activation='relu', kernel_regularizer=regularizers.l2(l2_reg), name='o1_Dense_1')(input_embed_tensor)
    ### High-Order Feature Extractor ###
    x_oh_tensor = Dense(10, activation='relu', kernel_regularizer=regularizers.l2(l2_reg), name='oh_Dense_1')(x_o1_tensor)
    x_oh_tensor = Dense(10, activation='relu', kernel_regularizer=regularizers.l2(l2_reg), name='oh_Dense_2')(x_oh_tensor)

    ### Output Layer ###
    output_tensor = Concatenate()([x_o1_tensor, x_oh_tensor])
    output_tensor = Dense(price_bucket_num, kernel_regularizer=regularizers.l2(l2_reg), name='concat_Dense')(output_tensor)
    output_tensor = Softmax(name='ground_truth')(output_tensor)

    model = Model(inputs=input_tensor,
                  outputs=output_tensor)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer = optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        loss=neighbourhood_likelihood_loss,
        optimizer=optimizer,
        experimental_run_tf_function=False
    )
    # summary() prints by itself and returns None, so it must not be wrapped
    # in print() (that would emit an extra "None" line).
    model.summary()
    return model

# Custom objects referenced by the model, keyed by name.
model_dependencies = dict(neighbourhood_likelihood_loss=neighbourhood_likelihood_loss)

# Rebuild the architecture from the configuration, then restore the trained
# weights from the unpacked artifact.
model = get_model(
    learning_rate=config.AI_LR,
    l2_reg=config.AI_L2_REG,
    b_start=config.ALGO_B_START,
    b_limit=config.ALGO_B_LIMIT,
    b_delta=config.ALGO_B_DELTA,
)
model.load_weights('./model/model.h5')

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
features (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
x_Embedding (Embedding)         (None, None, 10)     10240       features[0][0]                   
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 10)           0           x_Embedding[0][0]                
__________________________________________________________________________________________________
o1_Dense_1 (Dense)              (None, 10)           110         lambda[0][0]                     
______________________________________________________________________________________________

In [9]:
# load data and make prediction - RaggedTensor

import tensorflow as tf
import re
import random
import bz2

def get_dataset(dataset_path, shuffle=False, batch_size=512, drop_remainder=False, repeat=False, processes_per_host=-1):
    """Build a batched ``tf.data`` pipeline over a YOYI text-line dataset.

    Lines are read with ``TextLineDataset``, batched, and then parsed one
    batch at a time into ``({'features': ...}, {'ground_truth': ...})``.

    Args:
        dataset_path: file path(s) handed to ``tf.data.TextLineDataset``.
        batch_size: number of lines per batch.
        drop_remainder: forwarded to ``Dataset.batch``.
        shuffle, repeat, processes_per_host: accepted but not used by this
            implementation.

    Returns:
        The prefetching ``tf.data.Dataset``.
    """

    def _parse_batch(lines):
        # `lines` is a batch of raw rows, e.g. "0\t16\t123:1\t456:1\t789:1\n"
        without_suffix = tf.strings.regex_replace(lines, ":1", "")
        # Drop the first two characters of every row.
        start = tf.fill(tf.shape(without_suffix), 2)
        remaining = tf.strings.length(without_suffix) - 2
        trimmed = tf.strings.substr(without_suffix, start, remaining)
        fields = tf.strings.split(trimmed, '\t')

        # Features: every field after the first, hashed into 1024 buckets.
        hashed_features = tf.strings.to_hash_bucket_fast(fields[:, 1:], 1024)

        # Ground truth: the first remaining field is the winning price.
        win_price = tf.strings.to_number(fields[:, 0:1], tf.int64).to_tensor()
        # Fake the bidding price by adding integer-truncated Gaussian noise.
        noise = tf.random.normal(
            tf.shape(win_price),
            mean=0.0,
            stddev=32.0,
            dtype=tf.float32,
        )
        noise = tf.cast(noise, tf.int64)
        # Label is 1 where the noise is strictly positive, else 0.
        is_positive = tf.math.greater(noise, tf.zeros(tf.shape(noise), dtype=tf.int64))
        label = tf.cast(is_positive, tf.int64)
        # Columns: [label, win_price + noise, win_price].
        targets = tf.concat([label, win_price + noise, win_price], axis=-1)

        return ({'features': hashed_features}, {'ground_truth': targets})

    autotune = tf.data.experimental.AUTOTUNE
    return (
        tf.data.TextLineDataset(dataset_path)
        .batch(batch_size, drop_remainder=drop_remainder)
        .map(_parse_batch, num_parallel_calls=autotune)
        .prefetch(buffer_size=autotune)
    )

## get the ground truth
# Single definition of the test-set location, used both for reading the
# ground truth and for building the prediction dataset.
TEST_SET_PATH = './test_dataset/test_set'

# The winning price is the second tab-separated field of every row.
test_dataset_ground_truth = []
with open(TEST_SET_PATH) as test_file:
    for line in test_file:
        test_dataset_ground_truth.append(int(line.split('\t')[1]))

test_dataset = get_dataset(TEST_SET_PATH, shuffle=False, batch_size=config.AI_BATCH_SIZE, drop_remainder=False, repeat=False)
# One predicted distribution over price buckets per input row.
pdf = model.predict(test_dataset)

# The evaluation cells pair pdf[i] with test_dataset_ground_truth[i]; a length
# mismatch would make every metric meaningless, so fail early.
assert len(pdf) == len(test_dataset_ground_truth), (
    'predictions ({}) and ground truth ({}) differ in length'.format(
        len(pdf), len(test_dataset_ground_truth)))

In [10]:
# calculate prediction price, index of prediction price

# Index of the last price bucket.
MAX_IDX = int((B_LIMIT - B_START) / B_DELTA - 1)

# Index of the most probable bucket for every record. np.argmax returns the
# first occurrence of the maximum, like list.index(max(...)), but runs
# vectorised instead of converting every row to a Python list.
p_idx = np.argmax(pdf, axis=1).tolist()
# Predicted price = lower edge of the most probable bucket.
p = [B_START + idx * B_DELTA for idx in p_idx]

# load ground truth of bidding price and winning price, calculate their index
z = test_dataset_ground_truth
# Clamp to [0, MAX_IDX]: a price below B_START would otherwise produce a
# negative index, which silently wraps around when used to index a row of pdf.
z_idx = [min(max(math.floor((x - B_START) / B_DELTA), 0), MAX_IDX) for x in z]

In [11]:
# evaluation

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_auc_score

def evaluate(pdf, p, p_idx, z, z_idx):
    """Print evaluation metrics for the predicted price distributions.

    Args:
        pdf: per-record sequence of bucket probabilities.
        p: predicted price per record.
        p_idx: bucket index of the predicted price per record.
        z: ground-truth price per record.
        z_idx: bucket index of the ground-truth price per record.

    Prints the number of wins (records with ``p > z``), the MAE between
    ``p`` and ``z``, the ANLP, the C-Index (ROC AUC of the win outcome
    against the probability mass below the predicted bucket) and the
    average ground-truth price over the wins. Returns nothing.
    """
    outcomes = []           # 1 when the prediction wins, else 0
    mass_below_bid = []     # probability mass of the buckets below p_idx
    log_likelihood = 0
    won_value = 0

    for i in range(len(pdf)):
        distribution = pdf[i]

        # Win / lose outcome for this record.
        if p[i] > z[i]:
            outcome = 1
        else:
            outcome = 0
        outcomes.append(outcome)

        # ANLP term; the probability is floored at 1e-10 before the log.
        truth_probability = max(distribution[z_idx[i]], 1e-10)
        log_likelihood += math.log(truth_probability)

        # Score used for the C-Index.
        mass_below_bid.append(sum(distribution[0:p_idx[i]]))

        # Ground-truth price counted only for wins.
        won_value += z[i] * outcome

    print('Number of wins =', sum(outcomes), '/', len(outcomes), ', {:.2f}%'.format(sum(outcomes)/len(outcomes)*100))
    mae = mean_absolute_error(p, z)
    print('MAE = {:.2f}'.format(mae))
    print('ANLP =', str(-log_likelihood/len(pdf)))
    c_index = roc_auc_score(outcomes, mass_below_bid)
    print("C-Index = {:.4f}".format(c_index))
    print('Value = {:.2f}'.format(won_value/sum(outcomes)))
    
# Print the metrics for the test-set predictions and indices computed in the previous cells.
evaluate(pdf, p, p_idx, z, z_idx)

Number of wins = 12728784 / 38980658 , 32.65%
MAE = 83.37
ANLP = 3.410943188889108
C-Index = 0.9374
Value = 38.54
