In [1]:
import os
import time
import gc
import re
import math
import traceback
import tensorflow as tf
import numpy as np
import shutil
import json
import sys
import pandas as pd
import multiprocessing
from tensorflow.python.keras import backend as K
from pathlib import Path
from random import shuffle
from google.cloud import storage
from google.cloud.storage import blob
from datetime import datetime, timedelta
from tensorflow.core.protobuf import config_pb2
from scipy.sparse import csr_matrix
from tensorflow.python import debug as tfdbg
from tensorflow.python.lib.io import file_io

os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBEL_DEVICES'] = '0, 1'

In [2]:
HOME = Path(os.environ['HOME'])
training_path = HOME / 'ninja_project' / 'data' / 'netflix' / 'training'
src_files = [str(training_path / f) for f in os.listdir(str(training_path)) if 'n3m' in f]

tensorboard_path = Path(HOME) / 'tensorboard_files'
ckpt_dir = str(HOME / 'tensorflow_ckpt' / 'DeepRcommenderOneAD')

tensorboard_train_path = str(tensorboard_path / 'adr_train_onead')
tensorboard_test_path = str(tensorboard_path / 'adr_test_onead')
for path in [tensorboard_train_path, tensorboard_test_path, ckpt_dir]:
    if not os.path.exists(path):
        os.makedirs(path)
            
for path in [tensorboard_train_path, tensorboard_test_path, ckpt_dir]:
    if len(os.listdir(path)) > 0:
        for f in os.listdir(path):
            if os.path.isdir(os.path.join(path, f)):
                shutil.rmtree(os.path.join(path, f))
            else:
                os.remove(os.path.join(path, f))
    else:
        pass

# Load video data & Check

In [3]:
current_date = datetime.now()
bucket = "<GCP bucket>"
bucket_prefix = '<training data path>'
def check_date_in_storage(_date, bucket_name, bucket_prefix):
    
    def storage_newest_path(_bucket_name, _prefix_path):
        client = storage.Client()
        bucket = client.bucket(_bucket_name)
        return isinstance(bucket.get_blob(_prefix_path), blob.Blob)
    
    while True:
        month = datetime.strftime(_date.date(), "%m")
        day = datetime.strftime(_date.date(), "%d")
        prefix_path = os.path.join(bucket_prefix,
                                   'year={0}/month={1}/day={2}/'.format(_date.year, month, day))
        if storage_newest_path(bucket_name, prefix_path):
            break
        else:
            _date = _date - timedelta(days=1)
    return _date

get_check_date = check_date_in_storage(current_date, bucket, bucket_prefix)
get_check_date


datetime.datetime(2018, 12, 10, 0, 58, 22, 65091)

In [4]:
# Data in GCP
training_data_perfix = os.path.join('training data path/year={}/month={}/day={}/'.format(
    get_check_date.year,
    datetime.strftime(get_check_date.date(), "%m"),
    datetime.strftime(get_check_date.date(), "%d")))

validation_data_perfix = os.path.join('validation data path/year={}/month={}/day={}/'.format(
    get_check_date.year,
    datetime.strftime(get_check_date.date(), "%m"),
    datetime.strftime(get_check_date.date(), "%d")))
testing_data_perfix = os.path.join('testing data path/year={}/month={}/day={}/'.format(
    get_check_date.year,
    datetime.strftime(get_check_date.date(), "%m"),
    datetime.strftime(get_check_date.date(), "%d")))
click_testing_data_perfix = os.path.join('click testing data path/year={}/month={}/day={}/'.format(
    get_check_date.year,
    datetime.strftime(get_check_date.date(), "%m"),
    datetime.strftime(get_check_date.date(), "%d")))

training_files = tf.gfile.ListDirectory(training_data_perfix)
validation_files = tf.gfile.ListDirectory(validation_data_perfix)
testing_files = tf.gfile.ListDirectory(testing_data_perfix)
click_testing_files = tf.gfile.ListDirectory(click_testing_data_perfix)

training_list = [os.path.join(training_data_perfix, f) for f in training_files if '.csv' in f]
validation_list = [os.path.join(validation_data_perfix, f) for f in validation_files if '.csv' in f]
testing_list = [os.path.join(testing_data_perfix, f) for f in testing_files if '.csv' in f]
click_testing_list = [os.path.join(click_testing_data_perfix, f) for f in click_testing_files if '.csv' in f]


In [10]:
# get data schema
video_data_info_perfix = os.path.join('gs://onedata-prod/event/serve/adr/')
video_data_info_files = tf.gfile.ListDirectory(video_data_info_perfix)
video_data_info_list = [os.path.join(video_data_info_perfix, f) for f in video_data_info_files if '.json' in f]

with file_io.FileIO(video_data_info_list[0], mode='r') as input_f:
    data_schema = json.load(input_f)


if data_schema['CreateDate'] != get_check_date.strftime('%Y-%m-%d'):
    sys.exit(ValueError)
    

In [None]:
%%time
raw_data = pd.DataFrame()
for file in training_list:
    with file_io.FileIO(file, mode='r') as input_f:
        res = pd.read_csv(input_f)
        raw_data = pd.concat([raw_data, res], axis=0)


In [5]:
valid_data = pd.DataFrame()
for file in validation_list:
    with file_io.FileIO(file, mode='r') as input_f:
        res = pd.read_csv(input_f)
        valid_data = pd.concat([valid_data, res], axis=0)

In [6]:
test_data = pd.DataFrame()
for file in testing_list:
    with file_io.FileIO(file, mode='r') as input_f:
        res = pd.read_csv(input_f)
        test_data = pd.concat([test_data, res], axis=0)


In [7]:
valid_data.shape, test_data.shape

((474632, 85), (1110843, 85))

In [8]:
click_test_data = pd.DataFrame()
for file in click_testing_list:
    with file_io.FileIO(file, mode='r') as input_f:
        res = pd.read_csv(input_f)
        click_test_data = pd.concat([click_test_data, res], axis=0)


In [9]:
click_test_data.shape

(5454, 85)

# Build Model

In [11]:
def load_data(files_list, features_name, batch_size, epochs, is_training, name):
    def _parse_function(_line, _name):
        num_features = len(features_name)
        num_columns = num_features + 1
        record_defaults = [[]] * (num_features) + [['string']]
        decode_line = tf.decode_csv(_line, record_defaults=record_defaults, field_delim=',', name=_name)        
        #features = dict(zip(num_features, decode_line[:num_features]))
        index = tf.cast(decode_line[-1], tf.string)
        features = tf.reshape(decode_line[:num_features], shape=[num_features])
        return index, features

    def _input_fn():
        #match_pattern = tf.placeholder(dtype=tf.string)
        #filenames = tf.train.match_filenames_once(pattern=match_pattern, name=None)
        dataset = tf.data.Dataset.from_tensor_slices(files_list)
        dataset = dataset.flat_map(lambda filename: (tf.data.TextLineDataset(filename).\
                                                     skip(1).\
                                                     map(lambda line: _parse_function(line, name), 
                                                         num_parallel_calls=multiprocessing.cpu_count())))
        if is_training:
            dataset = dataset.shuffle(buffer_size=256)
            #dataset = dataset.repeat(epochs)    
        dataset = dataset.prefetch(buffer_size=batch_size * 100) # n = 元素個數 / Batch size 
        dataset = dataset.batch(batch_size)                                            
        return dataset
    return _input_fn   

In [None]:
# debug
'''
# reset graph and setting sess config
tf.reset_default_graph()
config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
config.gpu_options.allow_growth = True

with tf.Graph().as_default() as g:    
    with g.device('/cpu:0'):
        with tf.name_scope('DataPipelines'):
            train_dataset = load_data(files_list=training_list, features_name=data_schema['DataSchema'][:-1],
                                      batch_size=NUM_BATCH_SIZE, epochs=NUM_EPOCHS, is_training=1, name='train')    
            iterator = tf.data.Iterator.from_structure(train_dataset().output_types, 
                                                       train_dataset().output_shapes)    
            batch_index, batch_features = iterator.get_next()
            train_init_op = iterator.make_initializer(train_dataset())            
            
with tf.Session(graph=g, config=config) as sess:
    total_train_steps = 99999
    for epoch in range(1):        
        tic = time.time()
        sess.run(train_init_op)
        for i in range(total_train_steps):                
            try:
                train_labels, train_features = sess.run([batch_index, batch_features])                
            except tf.errors.OutOfRangeError:
                print('Finished, time:{}'.format(time.time() - tic))
                break


## DropBlock

In [131]:
# DropBlock
class DropBlock(tf.keras.layers.Layer) :
    def __init__(self, keep_prob, block_size, **kwargs):
        super(DropBlock, self).__init__(**kwargs)
        self.keep_prob = float(keep_prob) if isinstance(keep_prob, int) else keep_prob
        self.block_size = int(block_size)

    def compute_output_shape(self, input_shape):
        return input_shape
    
    def build(self, input_shape):
        _, self.h, self.w, self.channel = input_shape.as_list()
        # pad the mask
        bottom = right = (self.block_size -1) // 2
        top = left = (self.block_size -1) - bottom
        self.padding = [[0, 0], [top, bottom], [left, right], [0, 0]]
        self.set_keep_prob()
        super(DropBlock, self).build(input_shape)
        
    def set_keep_prob(self, keep_prob=None):
        """This method only support Eager Execution"""
        if keep_prob is not None:
            self.keep_prob = keep_prob
        w, h = tf.to_float(self.w), tf.to_float(self.h)
        self.gamma = (1. - self.keep_prob) * (w * h) / (self.block_size ** 2) / ((w - self.block_size + 1) * (h - self.block_size + 1))

    def _create_mask(self, input_shape):
        sampling_mask_shape = tf.stack([input_shape[0], 
                                        self.h - self.block_size + 1, 
                                        self.w - self.block_size + 1,
                                        self.channel])
        mask = DropBlock._bernoulli(sampling_mask_shape, self.gamma)
        # 擴充行列，並給予0值，依據 paddings 參數給予的上下左右值來做擴充，mode有三種模式可選，可參考 document
        mask = tf.pad(tensor=mask, paddings=self.padding, mode='CONSTANT') 
        mask = tf.nn.max_pool(value=mask, 
                              ksize=[1, self.block_size, self.block_size, 1], 
                              strides=[1, 1, 1, 1], 
                              padding='SAME')
        mask = 1 - mask
        return mask
        
    @staticmethod    
    def _bernoulli(shape, mean):
        return tf.nn.relu(tf.sign(mean - tf.random_uniform(shape, minval=0, maxval=1, dtype=tf.float32)))
    
    # The call function is a built-in function in 'tf.kera'.
    def call(self, inputs, training=None, scale=True, **kwargs):
        def drop():
            mask = self._create_mask(tf.shape(inputs))
            output = inputs * mask
            output = tf.cond(tf.constant(scale, dtype=tf.bool) if isinstance(scale, bool) else scale,
                             true_fn=lambda: output * tf.to_float(tf.size(mask)) / tf.reduce_sum(mask),
                             false_fn=lambda: output)
            return output
        
        if training is None:
            training = K.learning_phase()
        output = tf.cond(tf.logical_or(tf.logical_not(training), tf.equal(self.keep_prob, 1.0)),
                         true_fn=lambda: inputs,
                         false_fn=drop)
        return output



    

In [189]:
a = tf.placeholder(tf.float32, [None, 5, 5, 3])
keep_prob = tf.placeholder(tf.float32)
training = tf.placeholder(tf.bool)

drop_block = DropBlock(keep_prob=keep_prob, block_size=4)
b = drop_block(inputs=a, training=training)
noise_dist = tf.distributions.Bernoulli(probs=[0.5])
mask = noise_dist.sample(tf.stack(tf.shape(a)))
    
sess = tf.Session()
feed_dict = {a: np.ones([2, 5, 5, 3]), keep_prob: 0.8, training: True}
c, c_mask = sess.run([b, mask], feed_dict=feed_dict)
print(c.shape)
print(c[0, :, :, 0])
print(c_mask.reshape([2, 5, 5, 3])[0, :, :, 0])


(2, 5, 5, 3)
[[1.2711865 1.2711865 1.2711865 1.2711865 1.2711865]
 [1.2711865 1.2711865 1.2711865 1.2711865 1.2711865]
 [1.2711865 1.2711865 1.2711865 1.2711865 1.2711865]
 [1.2711865 1.2711865 1.2711865 1.2711865 1.2711865]
 [1.2711865 1.2711865 1.2711865 1.2711865 1.2711865]]
[[0 1 0 0 1]
 [0 1 0 1 1]
 [0 1 0 0 0]
 [0 1 1 0 1]
 [0 1 0 0 1]]


# Define layer function

In [12]:
def activation(x, fn, name=None):
    if fn == 'selu':
        return tf.nn.selu(x, name)
    elif fn == 'relu':
        return tf.nn.relu(x, name)
    elif fn == 'sigmoid':
        return tf.nn.sigmoid(x, name)
    elif fn == 'relu6':
        return tf.nn.relu6(x, name)
    elif fn == 'elu':
        return tf.nn.elu(x, name)
    elif fn == 'lrelu':
        return tf.nn.leaky_relu(x, 0.2, name)
    elif fn == None:
        return x
    else:
        raise ValueError('Unknown non-linearity type')

def initialize_weights(layers, is_constrained=False):
    W = {}
    b = {}
    with tf.variable_scope('hyperparameters', reuse=tf.AUTO_REUSE):
        with tf.name_scope('Weights'):
            init = tf.contrib.layers.xavier_initializer(seed=202109)
            for i in range(len(layers) - 1):
                layer_name = 'layer%s' % i
                W[i + 1] = tf.get_variable(name="W" + str(i + 1), shape=(layers[i], layers[i + 1]), initializer=init)            
                if is_constrained:
                    W[2 * len(layers) - 2 - i] = tf.transpose(W[i + 1])
                else:            
                    W[2 * len(layers) - 2 - i] = tf.get_variable(name="W" + str(2 * len(layers) - 2 - i),
                                                                 shape=(layers[i + 1], layers[i]),
                                                                 initializer=init)
                tf.summary.histogram(name=layer_name + '/Weights', values=W[i+1])
                tf.summary.histogram(name=layer_name + '/Weights', values=W[2 * len(layers) - 2 - i])
            
        with tf.name_scope('Biases'):
            for i in range(len(layers) - 1):
                layer_name = 'layer%s' % i    
                b[i + 1] = tf.get_variable(name="b" + str(i + 1),
                                           shape=(layers[i + 1], ),
                                           initializer=tf.zeros_initializer())
                b[2 * len(layers) - 2 - i] = tf.get_variable(name="b" + str(2 * len(layers) - 2 - i),
                                                             shape=(layers[i], ),
                                                             initializer=tf.zeros_initializer())
                tf.summary.histogram(name=layer_name + '/Biases', values=b[i + 1])
                tf.summary.histogram(name=layer_name + '/Biases', values=b[2 * len(layers) - 2 - i])
            
    return W, b

def encoder(x, weights, func, drop_prob):
    W, b = weights
    layers = int(len(W) / 2)
    with tf.name_scope('Encoder'):
        for layer in range(1, layers + 1):
            if isinstance(x, tf.SparseTensor):
                x = activation(tf.nn.bias_add(tf.sparse_tensor_dense_matmul(sp_a=x, b=W[layer]), b[layer]), func)
            else:
                x = activation(tf.nn.bias_add(tf.matmul(x, W[layer]), b[layer]), func)
            if drop_prob:
                x = tf.nn.dropout(x, drop_prob)
    return x

def decoder(x, weights, func, last_func):
    W, b = weights
    layers = int(len(W) / 2)
    with tf.name_scope('Decoder'):
        for layer in range(layers + 1, 2 * layers):
            x =  activation(tf.nn.bias_add(tf.matmul(x, W[layer]), b[layer]), func)
        # output layer    
        x = activation(tf.nn.bias_add(tf.matmul(x, W[len(W)]), b[len(b)]), last_func, name='Prediction')
    return x

def autoencoder(x, layers, is_constrained=False, drop_prob=1.0, func='selu', last_func='selu'):
    weights = initialize_weights(layers, is_constrained)    
    forward = encoder(x, weights, func, drop_prob)
    forward = decoder(forward, weights, func, last_func)
    return forward, weights

def get_loss(y, y_hat):
    """
    use Masked Mean Squared Error Loss
    """
    zero = tf.constant(0, dtype=tf.float32)
    mask = tf.not_equal(y, zero)
    mask = tf.cast(mask, tf.float32)
    mse_loss = tf.reduce_sum(tf.multiply(tf.square(y - y_hat), mask))
    num_mask = tf.reduce_sum(mask)        
    return mse_loss, tf.divide(mse_loss, num_mask)

def get_test_loss(y, y_hat):
    zero = tf.constant(0, dtype=tf.float32)
    mask = tf.not_equal(y, zero)
    mask = tf.cast(mask, tf.float32)
    y_hatm = tf.multiply(y_hat, mask)
    loss = tf.reduce_sum(tf.square(y_hatm - y))
    mask = tf.reduce_sum(mask)
    return loss, mask

def get_optimizer(optimizer_fn, lr, momentum_value):
    if optimizer_fn == 'momentum':
        return tf.train.MomentumOptimizer(lr, momentum_value)
    elif optimizer_fn =='adam':
        return tf.train.AdamOptimizer(lr)
    else:
        return tf.train.GradientDescentOptimizer(lr)
    
    

# Parameters setting

In [13]:
checkpoint_steps = 1    
dense_refeeding = True
NUM_EPOCHS = 1
NUM_BATCH_SIZE = 128
N_ITEMS = len(data_schema['DataSchema'][:-1])
#N_USERS = len(user_id_map)
LAYERS_SIZE = [N_ITEMS, 128, 256, 256]
START_LEARNING_RATE = 0.001
MOMENTUM_VALUE = 0.9
DROPOUT_RATE = 0.65

# Train and Saving model use SavedModel module  

In [14]:
# reset graph and setting sess config
tf.reset_default_graph()
config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
config.gpu_options.allow_growth = True
with tf.Graph().as_default() as g:    
    tf.set_random_seed(202109)
    with g.device('/cpu:0'):
        with tf.name_scope('DataPipelines'):
            train_dataset = load_data(files_list=training_list, features_name=data_schema['DataSchema'][:-1], 
                                      batch_size=NUM_BATCH_SIZE, epochs=NUM_EPOCHS, is_training=1, name='train')    
            valid_dataset = load_data(files_list=validation_list, features_name=data_schema['DataSchema'][:-1], 
                                      batch_size=NUM_BATCH_SIZE, epochs=NUM_EPOCHS, is_training=0, name='valid') 
            iterator = tf.data.Iterator.from_structure(train_dataset().output_types, 
                                                       train_dataset().output_shapes)    
            batch_index, batch_features = iterator.get_next()
            train_init_op = iterator.make_initializer(train_dataset())
            valid_init_op = iterator.make_initializer(valid_dataset())
            
    with g.device('/gpu:0'):        
        with tf.name_scope('Placeholder'):
            # X = tf.sparse_placeholder(tf.float32, [None, N_ITEMS], name='X')
            X = tf.placeholder(tf.float32, [None, N_ITEMS], name='Inputs')
            Y = tf.placeholder(tf.float32, [None, N_ITEMS], name='Targets')
            global_step = tf.Variable(tf.constant(0), trainable=False)
    
        with tf.name_scope('Autoencoder'):
            Y_hat, weights_dict = autoencoder(x=X, layers=LAYERS_SIZE, is_constrained=False, 
                                              drop_prob=0.5, func='selu', last_func='selu')
        
        with tf.name_scope('Loss'):
            loss, mmse_loss = get_loss(y=Y, y_hat=Y_hat)
            loss_sum, mask_sum = get_test_loss(y=Y, y_hat=Y_hat)
            tf.summary.scalar(name='MSELoss', tensor=loss)
            tf.summary.scalar(name='MMSELoss', tensor=mmse_loss)
            #tf.summary.scalar(name='LossSum', tensor=loss_sum)
            #tf.summary.scalar(name='MaskSum', tensor=mask_sum)
            
    with g.device('/gpu:1'):        
        with tf.name_scope('Optimizer'):
            #learning_rate = tf.train.exponential_decay(START_LEARNING_RATE, global_step,
            #                                           1000, 0.96, staircase=False)
            optimizer = get_optimizer('momentum', START_LEARNING_RATE, MOMENTUM_VALUE)
            train_step = optimizer.minimize(mmse_loss, global_step=global_step)
    #saver = tf.train.Saver()  
    

with tf.Session(graph=g, config=config) as sess:
    
    export_dir = os.path.join(ckpt_dir, 'export')        
    total_train_steps = 99999
    total_valid_steps = 99999
    merged_summary = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(tensorboard_train_path, graph=sess.graph, filename_suffix='DeepRecommender')
    test_writer = tf.summary.FileWriter(tensorboard_test_path, filename_suffix='DeepRecommender')

    init = tf.global_variables_initializer()
    sess.run(init)
    
    # Best validation loss seen so far.
    best_valid_loss = 99999
        
    # Iteration-number for last improvement to validation loss.
    last_improvement = 0

    # Stop optimization if no improvement found in this many iterations.
    require_improvement = 5
    
    print('Start training...')
    for epoch in range(NUM_EPOCHS):
        print('=== Epoch: {:2d} ==='.format(epoch))
        tic = time.time()
        total_loss_sum = 0
        total_mask_sum = 0
        valid_total_loss_sum = 0
        valid_total_mask_sum = 0  
        train_loss_list = []
        train_mmse_loss_list = []
        valid_loss_list = []
        valid_mmse_loss_list = []
        
        sess.run(train_init_op)
        for i in range(total_train_steps):                
            try:
                train_labels, train_features = sess.run([batch_index, batch_features])                
                _, train_result, train_loss, train_mmse_loss, train_loss_sum, train_mask_sum = sess.run([train_step, Y_hat, loss, mmse_loss, loss_sum, mask_sum], 
                                                                                                        feed_dict={X: train_features, Y: train_features},
                                                                                                        options=config_pb2.RunOptions(report_tensor_allocations_upon_oom=True))            
                if dense_refeeding:
                    _, train_loss, train_mmse_loss, train_loss_sum, train_mask_sum = sess.run([train_step, loss, mmse_loss, loss_sum, mask_sum], 
                                                                                              feed_dict={X: train_result, Y: train_result})                          
                total_loss_sum += train_loss_sum
                total_mask_sum += train_mask_sum
                train_loss_list.append(train_loss)
                train_mmse_loss_list.append(train_mmse_loss) 
                if np.isnan(train_loss):
                    print('Loss Nan error')
                    break
                if i % 1000 == 0 and i > 0:
                    print('Iter: {:4d}, RMSE: {:8f}'.format(i, np.sqrt(total_loss_sum/total_mask_sum)))
            except tf.errors.OutOfRangeError:
                print('Iter: {:4d} is end of line.'.format(i))
                break
        
        trainingRMSE = np.sqrt(total_loss_sum/total_mask_sum)                 
        summary = sess.run(merged_summary, feed_dict={X: train_features, Y: train_features}) 
        train_writer.add_summary(summary, epoch) # epoch * total_train_steps + i        
        
        sess.run(valid_init_op)
        for _ in range(total_valid_steps):
            try:
                valid_labels, valid_features = sess.run([batch_index, batch_features])
                vaild_loss, valid_mmse_loss, valid_loss_sum, valid_mask_sum = sess.run([loss, mmse_loss, loss_sum, mask_sum], 
                                                                                       feed_dict={X: valid_features, Y: valid_features})
                valid_total_loss_sum += valid_loss_sum
                valid_total_mask_sum += valid_mask_sum
                valid_loss_list.append(vaild_loss)
                valid_mmse_loss_list.append(valid_mmse_loss)
                if np.isnan(vaild_loss):
                    break
            except tf.errors.OutOfRangeError:
                break  
                
        validRMSE = np.sqrt(valid_total_loss_sum/valid_total_mask_sum)
        valid_summary = sess.run(merged_summary, feed_dict={X: valid_features, Y: valid_features})            
        test_writer.add_summary(valid_summary, epoch)         
        #print("Epoch: {:>2d}, Train Loss: {:>2.8f}, Valid Loss:{:>2.8f}".format(epoch, np.mean(train_mmse_loss_list), np.mean(valid_mmse_loss_list)))    
        print("Epoch: {:>2d}, Train Loss: {:>2.8f}, Valid Loss:{:>2.8f}, Compute time: {:>2.8f}".format(epoch, trainingRMSE, validRMSE, time.time() - tic))    
        
        # Early Stopping
        if validRMSE < best_valid_loss:
            # Update the best-known validation accuracy.
            best_valid_loss = valid_mmse_loss
            
            # Set the iteration for the last improvement to current.
            last_improvement = epoch

            # Save all variables of the TensorFlow graph to file.
            # Build the tensor info             
            #graph = tf.get_default_graph()
            inputs = g.get_tensor_by_name('Placeholder/Inputs:0')
            prediction = g.get_tensor_by_name('Autoencoder/Decoder/Prediction:0')
            model_input = tf.saved_model.utils.build_tensor_info(inputs)
            model_output = tf.saved_model.utils.build_tensor_info(prediction)
            # Build signature definition
            signature = tf.saved_model.signature_def_utils.build_signature_def(
                inputs={'inputs': model_input},
                outputs={'outputs': model_output}, 
                method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
            
            if os.path.exists(export_dir):
                shutil.rmtree(export_dir, ignore_errors=True)
            builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
            builder.add_meta_graph_and_variables(
                sess, [tf.saved_model.tag_constants.SERVING], 
                signature_def_map={
                    tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature
                }, strip_default_attrs=True)
            builder.save()
            # Use saver to save graph, variables, assets
            #saver.save(sess, os.path.join(ckpt_dir, 'DeepRecommender.ckpt'), 
            #           global_step=epoch) # epoch * total_train_steps + i

            # A string to be printed below, shows improvement found.
            improved_str = '*'
        else:
            # An empty string to be printed below. Shows that no improvement was found.
            improved_str = ''
            
            # Status-message for printing.
            #msg = "Epoch: {0:>6}, Train Loss: {1:>6.5%}, Validation Loss: {2:>6.5%} {3}"                    
            #print(msg.format(epoch, train_mmse_loss, valid_mmse_loss, improved_str))

            # If no improvement found in the required number of iterations.
            if epoch - last_improvement > require_improvement:
                print("No improvement found in a while, stopping optimization.")                
                break
                            
    train_writer.close()  
    test_writer.close()


Start training...
=== Epoch:  0 ===
Iter: 1000, RMSE: 1.440645
Iter: 2000, RMSE: 1.242693
Iter: 3000, RMSE: 1.123144
Iter: 4000, RMSE: 1.043585
Iter: 5000, RMSE: 0.984786
Iter: 6000, RMSE: 0.938379
Iter: 7000, RMSE: 0.900580
Iter: 8000, RMSE: 0.868793
Iter: 8862 is end of line.
Epoch:  0, Train Loss: 0.84492158, Valid Loss:2.07888396, Compute time: 260.99773622
INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: b'/home/yuyuliao/tensorflow_ckpt/DeepRcommenderOneAD/export/saved_model.pb'


# Prediction and Accuracy

### Use Saver module to prediction

In [None]:
from tensorflow.python.tools import inspect_checkpoint as chkp
save_path = '/home/yuyuliao/tensorflow_ckpt/DeepRcommenderOneAD/'
checkpoint_file = tf.train.latest_checkpoint(save_path)
chkp.print_tensors_in_checkpoint_file(checkpoint_file, tensor_name='', all_tensors=True)


In [None]:
chkp.print_tensors_in_checkpoint_file(checkpoint_file, tensor_name='hyperparameters/W1', all_tensors=False)

In [None]:
tf.reset_default_graph()
config = tf.ConfigProto(allow_soft_placement = True)
checkpoint_file = tf.train.latest_checkpoint(save_path)
#with tf.Session(config=config) as pred_sess:
pred_sess = tf.Session(config=config)
imported_meta = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
imported_meta.restore(pred_sess, tf.train.latest_checkpoint(save_path))

graph = tf.get_default_graph()
graph_nodes_name = [n.name for n in graph.as_graph_def().node]

In [None]:
graph.get_operation_by_name("Placeholder/Inputs"), graph.get_operation_by_name('Autoencoder/Decoder/prediction')

In [None]:
X = graph.get_tensor_by_name('Placeholder/Inputs:0')
feed_dict = {X: valid_features}
output = graph.get_tensor_by_name("Autoencoder/Decoder/Prediction:0")
#init = tf.global_variables_initializer()
#with tf.Session(graph=graph, config=config) as test_sess: 
#test_sess.run(init)
res = pred_sess.run(output, feed_dict=feed_dict)
pred_sess.close()

## Use SavedModel module to prediction

In [21]:
tensorboard_path = Path(HOME) / 'tensorboard_files'
ckpt_dir = str(HOME / 'tensorflow_ckpt' / 'DeepRcommenderOneAD')
export_dir = os.path.join(ckpt_dir, 'export')
export_dir

'/home/yuyuliao/tensorflow_ckpt/DeepRcommenderOneAD/export'

In [22]:
%%time
tf.reset_default_graph()
config = tf.ConfigProto(allow_soft_placement = True)

with tf.Graph().as_default() as g: 
    test_dataset = load_data(files_list=validation_list, features_name=data_schema['DataSchema'][:-1], 
                              batch_size=NUM_BATCH_SIZE, epochs=NUM_EPOCHS, is_training=0, name='test') 
    iterator = tf.data.Iterator.from_structure(test_dataset().output_types, 
                                               test_dataset().output_shapes)    
    testing_batch_index, testing_batch_features = iterator.get_next()
    test_init_op = iterator.make_initializer(test_dataset())    

    
with tf.Session(config=config, graph=g) as pred_sess:
    total_valid_steps = 99999
    meta_graph_def = tf.saved_model.loader.load(pred_sess, [tf.saved_model.tag_constants.SERVING], export_dir)

    signature = meta_graph_def.signature_def
    x_tensor_name = signature[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY].inputs['inputs'].name
    y_tensor_name = signature[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY].outputs['outputs'].name

    #graph = tf.get_default_graph()
    #graph_nodes_name = [n.name for n in graph.as_graph_def().node]
    pred_sess.run(test_init_op)
    prediction_recoder = []
    X = pred_sess.graph.get_tensor_by_name(x_tensor_name)
    Y = pred_sess.graph.get_tensor_by_name(y_tensor_name)
    for _ in range(total_valid_steps):
        try:
            testing_labels, testing_features = pred_sess.run([testing_batch_index, testing_batch_features])
            decode_valid_labels = [l.decode('utf-8') for l in testing_labels]            
            predicted_result = pred_sess.run(Y, feed_dict={X: testing_features})                         
            prediction_recoder.append(list(zip(decode_valid_labels, np.round(predicted_result, 6))))
        except tf.errors.OutOfRangeError:
            break

print('Num of iteration:{}, total predicted count: {}'.format(len(prediction_recoder), 
                                                              len(prediction_recoder) * NUM_BATCH_SIZE))            

INFO:tensorflow:Restoring parameters from b'/home/yuyuliao/tensorflow_ckpt/DeepRcommenderOneAD/export/variables/variables'
Num of iteration:3709, total predicted count: 474752
CPU times: user 7min 16s, sys: 12.5 s, total: 7min 28s
Wall time: 1min 4s
