In [1]:
import os
import time
import gc
import re
import math
import traceback
import tensorflow as tf
import numpy as np
import shutil
import json
import sys
import pandas as pd
import multiprocessing
from pathlib import Path
from random import shuffle
from google.cloud import storage
from google.cloud.storage import blob
from datetime import datetime, timedelta
from tensorflow.core.protobuf import config_pb2
from scipy.sparse import csr_matrix
from tensorflow.python import debug as tfdbg
from tensorflow.python.lib.io import file_io

os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBEL_DEVICES'] = '0, 1'


In [2]:
HOME = Path(os.environ['HOME'])
tensorboard_path = Path(HOME) / 'tensorboard_files'
ckpt_dir = str(HOME / 'tensorflow_ckpt' / 'CDAE')

tensorboard_train_path = str(tensorboard_path / 'cdae_train')
tensorboard_test_path = str(tensorboard_path / 'cdae_test')
for path in [tensorboard_train_path, tensorboard_test_path, ckpt_dir]:
    if not os.path.exists(path):
        os.makedirs(path)
            
for path in [tensorboard_train_path, tensorboard_test_path, ckpt_dir]:
    if len(os.listdir(path)) > 0:
        for f in os.listdir(path):
            if os.path.isdir(os.path.join(path, f)):
                shutil.rmtree(os.path.join(path, f))
            else:
                os.remove(os.path.join(path, f))
    else:
        pass


# Load video data & Check

In [3]:
current_date = datetime.now()
bucket = "<GCP bucket>"
bucket_prefix = '<training data path>'
def check_date_in_storage(_date, bucket_name, bucket_prefix):
    
    def storage_newest_path(_bucket_name, _prefix_path):
        client = storage.Client()
        bucket = client.bucket(_bucket_name)
        return isinstance(bucket.get_blob(_prefix_path), blob.Blob)
    
    while True:
        month = datetime.strftime(_date.date(), "%m")
        day = datetime.strftime(_date.date(), "%d")
        prefix_path = os.path.join(bucket_prefix,
                                   'year={0}/month={1}/day={2}/'.format(_date.year, month, day))
        if storage_newest_path(bucket_name, prefix_path):
            break
        else:
            _date = _date - timedelta(days=1)
    return _date

get_check_date = check_date_in_storage(current_date, bucket, bucket_prefix)
get_check_date


datetime.datetime(2018, 12, 10, 0, 46, 44, 957163)

In [4]:
# Data in GCP
training_data_perfix = os.path.join('training data path/year={}/month={}/day={}/'.format(
    get_check_date.year,
    datetime.strftime(get_check_date.date(), "%m"),
    datetime.strftime(get_check_date.date(), "%d")))

validation_data_perfix = os.path.join('validation data path/year={}/month={}/day={}/'.format(
    get_check_date.year,
    datetime.strftime(get_check_date.date(), "%m"),
    datetime.strftime(get_check_date.date(), "%d")))
testing_data_perfix = os.path.join('testing data path/year={}/month={}/day={}/'.format(
    get_check_date.year,
    datetime.strftime(get_check_date.date(), "%m"),
    datetime.strftime(get_check_date.date(), "%d")))
click_testing_data_perfix = os.path.join('click testing data path/year={}/month={}/day={}/'.format(
    get_check_date.year,
    datetime.strftime(get_check_date.date(), "%m"),
    datetime.strftime(get_check_date.date(), "%d")))

training_files = tf.gfile.ListDirectory(training_data_perfix)
validation_files = tf.gfile.ListDirectory(validation_data_perfix)
testing_files = tf.gfile.ListDirectory(testing_data_perfix)
click_testing_files = tf.gfile.ListDirectory(click_testing_data_perfix)

training_list = [os.path.join(training_data_perfix, f) for f in training_files if '.csv' in f]
validation_list = [os.path.join(validation_data_perfix, f) for f in validation_files if '.csv' in f]
testing_list = [os.path.join(testing_data_perfix, f) for f in testing_files if '.csv' in f]
click_testing_list = [os.path.join(click_testing_data_perfix, f) for f in click_testing_files if '.csv' in f]


In [5]:
# get data schema
video_data_info_perfix = os.path.join('<main path>')
video_data_info_files = tf.gfile.ListDirectory(video_data_info_perfix)
video_data_info_list = [os.path.join(video_data_info_perfix, f) for f in video_data_info_files if '.json' in f]

with file_io.FileIO(video_data_info_list[1], mode='r') as input_f:
    data_schema = json.load(input_f)

if data_schema['CreateDate'] != get_check_date.strftime('%Y-%m-%d'):
    sys.exit(ValueError)


In [6]:
raw_data = pd.DataFrame()
for file in training_list:
    with file_io.FileIO(file, mode='r') as input_f:
        res = pd.read_csv(input_f)
        raw_data = pd.concat([raw_data, res], axis=0)


In [7]:
valid_data = pd.DataFrame()
for file in validation_list:
    with file_io.FileIO(file, mode='r') as input_f:
        res = pd.read_csv(input_f)
        valid_data = pd.concat([valid_data, res], axis=0)

In [8]:
test_data = pd.DataFrame()
for file in testing_list:
    with file_io.FileIO(file, mode='r') as input_f:
        res = pd.read_csv(input_f)
        test_data = pd.concat([test_data, res], axis=0)


In [9]:
valid_data.shape, test_data.shape

((474632, 85), (1110843, 85))

In [10]:
click_test_data = pd.DataFrame()
for file in click_testing_list:
    with file_io.FileIO(file, mode='r') as input_f:
        res = pd.read_csv(input_f)
        click_test_data = pd.concat([click_test_data, res], axis=0)


In [11]:
click_test_data.shape

(5454, 85)

# Load netflix data

In [None]:
training_path = HOME / 'ninja_project' / 'data' / 'netflix' / 'training'
src_files = [str(training_path / f) for f in os.listdir(str(training_path)) if 'n3m' in f]

In [None]:
%%time
def _build_maps(src_files):
    _user_id_map = dict()
    _item_id_map = dict()

    #src_files = [path.join(self._data_dir, f)
    #             for f in listdir(self._data_dir)
    #             if path.isfile(path.join(self._data_dir, f)) and f.endswith(self._extension)]

    u_id = 0
    i_id = 0
    u_index = 0
    i_index = 1
    r_index = 2
    for source_file in src_files:
        with open(source_file, 'r') as src:
            for line in src.readlines():
                parts = line.strip().split('\t')
                if len(parts)<3:
                    raise ValueError('Encountered badly formatted line in {}'.format(source_file))
                u_id_orig = int(parts[u_index])
                if u_id_orig not in _user_id_map:
                    _user_id_map[u_id_orig] = u_id
                    u_id += 1

                i_id_orig = int(parts[i_index])
                if i_id_orig not in _item_id_map:
                    _item_id_map[i_id_orig] = i_id
                    i_id += 1
    return _user_id_map, _item_id_map

user_id_map, item_id_map = _build_maps(src_files)

In [None]:
%%time
def _build_user_items_dict(src_file):
    all_data = dict()
    #for source_file in src_files:
    with open(src_file, 'r') as f: 
        for line in f.readlines():
            parts = line.strip().split('\t')
            if len(parts) < 3:
                raise ValueError('Encountered badly formatted line in {}'.format(train_sample))
            key = user_id_map[int(parts[0])] # user_index value
            value = item_id_map[int(parts[1])] # item_index value
            rating = float(parts[2])
            if key not in all_data:
                all_data[key] = []
            all_data[key].append((value, rating))
    return all_data

train_data = _build_user_items_dict('/home/yuyuliao/ninja_project/data/netflix/training/n3m.train.txt')
valid_data = _build_user_items_dict('/home/yuyuliao/ninja_project/data/netflix/training/n3m.valid.txt')
test_data = _build_user_items_dict('/home/yuyuliao/ninja_project/data/netflix/training/n3m.test.txt')

In [None]:
def iterate_one_epoch(data, batch_size, _num_items):
    keys = list(data.keys())
    shuffle(keys)
    s_ind = 0
    e_ind = batch_size
    mini_batch = []
    mini_batch_index = []    
    while e_ind < len(keys): # will have few sample droped 
        local_ind = 0 # track the local minibatch index
        ind_users = [] # contain the indices of the minibatch
        ind_users_remaind = []
        ind_items = [] # contain the minor index of the input data (second column of the input data)
        vals = [] # contain the ratings
        indices_list = [] # create indices list of list

        for ind in range(s_ind, e_ind):    
            ind_items += [v[0] for v in data[keys[ind]]] # items index
            ind_users += [local_ind] * len([v[0] for v in data[keys[ind]]]) # 這個人評論過 n 部電影
            ind_users_remaind += [keys[ind]]
            vals += [v[1] for v in data[keys[ind]]]
            local_ind += 1

        #for line in zip(ind_user, ind_items):
        #    indices_list.append(list(line))

        s_ind += batch_size
        e_ind += batch_size
        mini_batch = {'index': ind_users_remaind,
                      'data': csr_matrix((vals, (ind_users, ind_items)), shape=[batch_size, _num_items]).toarray()}
        yield mini_batch

        #mini_batch = {'user_index': ind_user, 'item_index': ind_items, 
        #              'indice': indices_list, 'value': vals, 'batch_size': e_ind-s_ind}
        #mini_batch = {'user_index': ind_user, 'item_index': ind_items,
        #              'sparse_tensor': tf.SparseTensorValue(indices=indices_list, values=vals, dense_shape=[batch_size, _num_items])}

In [None]:
def iterate_one_epoch_valid(data, batch_size, _num_items):
    keys = list(data.keys())
    weight = math.floor(len(keys) *0.50)
    shuffle(keys[0:weight])
    s_ind = 0
    e_ind = batch_size
    while e_ind < len(keys): # will have few sample droped 
        local_ind = 0 # track the local minibatch index
        ind_user = [] # contain the indices of the minibatch
        ind_items = [] # contain the minor index of the input data (second column of the input data)
        vals = [] # contain the ratings
        indices_list = [] # create indices list of list

        for ind in range(s_ind, e_ind):    
            ind_items += [v[0] for v in data[keys[ind]]] # items index
            ind_user += [local_ind] * len([v[0] for v in data[keys[ind]]]) # 這個人評論過 n 部電影
            vals += [v[1] for v in data[keys[ind]]]
            local_ind += 1
        #for line in zip(ind_user, ind_items):
        #    indices_list.append(list(line))

        s_ind += batch_size
        e_ind += batch_size
    
        #mini_batch = {'user_index': ind_user, 'item_index': ind_items, 
        #              'indice': indices_list, 'value': vals, 'batch_size': e_ind-s_ind}
        #mini_batch = {'user_index': ind_user, 'item_index': ind_items,
        #              'sparse_tensor': tf.SparseTensorValue(indices=indices_list, values=vals, dense_shape=[batch_size, _num_items])}
        mini_batch = csr_matrix((vals, (ind_user, ind_items)), shape=[batch_size, _num_items]).toarray()
        yield mini_batch

# Load sparse matrix of video id data

In [None]:
from tensorflow.python.lib.io import file_io
def load_data_from_gcs(_raw_data_path):
    data_prefix = os.path.join(_raw_data_path)
    file_list = tf.gfile.ListDirectory(data_prefix)
    raw_data = []
    try:
        json_files = [os.path.join(data_prefix, f) for f in file_list if '.csv' in f]
        for file in json_files:
            with file_io.FileIO(file, mode='rb') as input_f:
                for line in input_f:
                    d = json.loads(line.decode('utf-8'))
                    raw_data.append(d)
    except:
        print('[ERROR] Get wrong file\'s format, please check.')
        exit(1)
    return pd.DataFrame(raw_data)

In [None]:
data_prefix = os.path.join('gs://onedata-test/event/serve/adr/data_training/year=2018/month=09/day=11')
file_list = tf.gfile.ListDirectory(data_prefix)
csv_files = [os.path.join(data_prefix, f) for f in file_list if '.csv' in f]

In [None]:
raw_data = pd.DataFrame()
for file in csv_files:
    with file_io.FileIO(file, mode='r') as input_f:
        res = pd.read_csv(input_f)
        raw_data = pd.concat([raw_data, res], axis=0)



##  Use tensorflow data api - QueueRunners moudel

In [None]:
from tensorflow.python.lib.io import file_io
data_prefix = os.path.join('<training data path>')
file_list = tf.gfile.ListDirectory(data_prefix)

In [None]:
dataset = tf.data.Dataset.from_tensor_slices(file_list)

In [None]:
batch_size = 5
min_after_dequeue = 100
capacity = min_after_dequeue + 3 * batch_size

tf.reset_default_graph()
pattern = os.path.join(data_prefix, '*.csv')
filenames = tf.train.match_filenames_once(pattern, name=None)
filename_queue = tf.train.string_input_producer(filenames, shuffle=True, seed=202109)
reader = tf.TextLineReader(skip_header_lines=1)
key, value = reader.read(filename_queue)

record_defaults = [tf.constant([], dtype=tf.string)] + [tf.constant([0], dtype=tf.int32) for _ in range(47)]
#record_defaults = [[""]] + [[0]]*47
decode_line = tf.decode_csv(value, record_defaults=record_defaults, field_delim=',')
uuid = tf.cast(decode_line[0], tf.string)
reocrds = tf.reshape(decode_line[1:], shape=[-1, 47])

#mb = tf.train.shuffle_batch(decode_line, batch_size=batch_size, num_threads=16, min_after_dequeue=min_after_dequeue, capacity=capacity)
mb = tf.train.batch([uuid, reocrds],  batch_size=batch_size, num_threads=64, capacity=capacity)
init = (tf.global_variables_initializer(), tf.local_variables_initializer())

with tf.Session() as sess:
    sess.run(init)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    for i in range(5):
        #data = sess.run([decode_line, mb])
        mini_batch = sess.run(mb)        
    coord.request_stop()
    coord.join(threads)

# Use tensorflow data api - dataset moudel

In [12]:
def load_data(files_list, features_name, batch_size, epochs, is_training, name):
    def _parse_function(_line, _name):
        num_features = len(features_name)
        num_columns = num_features + 1
        record_defaults = [[]] * (num_features) + [['string']]
        decode_line = tf.decode_csv(_line, record_defaults=record_defaults, field_delim=',', name=_name)        
        #features = dict(zip(num_features, decode_line[:num_features]))
        index = tf.cast(decode_line[-1], tf.string)
        features = tf.reshape(decode_line[:num_features], shape=[num_features])
        mask = tf.cast(tf.where(tf.greater(features, 0), 
                                x=features, y=tf.zeros_like(features, dtype=tf.float32)), dtype=tf.bool)        
        return index, features, mask

    def _input_fn():
        #match_pattern = tf.placeholder(dtype=tf.string)
        #filenames = tf.train.match_filenames_once(pattern=match_pattern, name=None)
        dataset = tf.data.Dataset.from_tensor_slices(files_list)
        dataset = dataset.flat_map(lambda filename: (tf.data.TextLineDataset(filename).\
                                                     skip(1).\
                                                     map(lambda line: _parse_function(line, name), 
                                                         num_parallel_calls=multiprocessing.cpu_count())))
        if is_training:
            dataset = dataset.shuffle(buffer_size=256)
            #dataset = dataset.repeat(epochs)    
        dataset = dataset.prefetch(buffer_size=batch_size * 100) # n = 元素個數 / Batch size 
        dataset = dataset.batch(batch_size)                                            
        return dataset
    return _input_fn   

In [None]:
# debug
'''
tf.reset_default_graph()
config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
config.gpu_options.allow_growth = True

with tf.Graph().as_default() as g:    
    with g.device('/cpu:0'):
        with tf.name_scope('DataPipelines'):
            train_dataset = load_data(files_list=training_list, features_name=data_schema['DataSchema'][:-1], 
                                      batch_size=NUM_BATCH_SIZE, epochs=NUM_EPOCHS, is_training=1, name='train')    
            iterator = tf.data.Iterator.from_structure(train_dataset().output_types, 
                                                       train_dataset().output_shapes)    
            batch_index, batch_features, batch_mask = iterator.get_next()
            train_init_op = iterator.make_initializer(train_dataset())            
            
with tf.Session(graph=g, config=config) as sess:
    total_train_steps = 99999
    for epoch in range(1):        
        user_v_index_dict = {}
        tic = time.time()        
        sess.run(train_init_op)
        for i in range(total_train_steps):                
            try:
                train_labels, train_features, train_mask = sess.run([batch_index, batch_features, batch_mask])  
                for k, v in enumerate(train_labels):
                    user_v_index_dict[v] = (NUM_BATCH_SIZE * i) + k

            except tf.errors.OutOfRangeError:
                print('Finished, time:{}'.format(time.time() - tic))
                break

# Paraments setting

In [13]:
N_ITEMS = len(data_schema['DataSchema'][:-1])

# raw_data.shape[0], 不使用 user 的總數是因為這樣還需要預載 user data，用 user 是否看過該影片來當作 user information
N_USERS = len(data_schema['DataSchema'][:-1]) 

# Parameters setting
NUM_EPOCHS = 1
NUM_BATCH_SIZE = 128
CORRUPTION_LEVEL = 0.333
DROPOUT_PROB = 0.5
N_HIDDEN = 50
LAYERS = [N_ITEMS, N_HIDDEN, N_ITEMS]
        
# AdaGrad Parameters
ADA_BETA = 1.0
START_LEARNING_RATE = 0.00001

# Regularization Parameter
REG_LAMBDA = 0.001

gradient_clip = 1.25

# Standard representations of weight version

In [14]:
def xavier_init(node_in, node_out, constant=1):
    low = -constant * np.sqrt( 6 / (node_in + node_out))
    high = constant * np.sqrt( 6 / (node_in + node_out))
    return tf.random_uniform((node_in, node_out), minval=low, maxval=high, dtype=tf.float32)

def activation(x, fn, name=None):
    if fn == 'selu':
        return tf.nn.selu(x, name)
    elif fn == 'relu':
        return tf.nn.relu(x, name)
    elif fn == 'sigmoid':
        return tf.nn.sigmoid(x, name)
    elif fn == 'relu6':
        return tf.nn.relu6(x, name)
    elif fn == 'elu':
        return tf.nn.elu(x, name)
    elif fn == 'lrelu':
        return tf.nn.leaky_relu(x, 0.2, name)
    elif fn == None:
        return x
    else:
        raise ValueError('Unknown non-linearity type')
        
def initialize_weights_v1(n_users, n_items, n_hidden):
    start_time = time.time()
    print('Initial weights...')
    all_weights = dict()
    W = xavier_init(n_items, n_hidden)
    W_prime = xavier_init(n_items, n_hidden)
    with tf.name_scope('Weights'):
        all_weights['W'] = [tf.Variable(W[i]) for i in range(n_items)]
        all_weights['W_prime'] = [tf.Variable(W_prime[i]) for i in range(n_items)]
        tf.summary.histogram(name='W', values=all_weights['W'])
        tf.summary.histogram(name='W_prime', values=all_weights['W_prime'])
        
    with tf.name_scope('Biases'):
        all_weights['b'] = tf.Variable(tf.zeros([1, n_hidden], dtype=tf.float32))
        all_weights['b_prime'] = [tf.Variable(tf.constant(0, dtype=tf.float32), dtype=tf.float32) for _ in range(n_items)]
        tf.summary.histogram(name='b', values=all_weights['b'])
        tf.summary.histogram(name='b_prime', values=all_weights['b_prime'])
            
    with tf.name_scope('UserSpecific'):  
        all_weights['V'] = [tf.Variable(tf.zeros([n_hidden])) for _ in range(n_users)]
        tf.summary.histogram(name='V', values=all_weights['V'])
        #all_weights['V'] = tf.get_variable(name='V', 
        #                                   initializer=tf.truncated_normal(shape=[n_users, n_hidden], 
        #                                                                   mean=0, stddev=0.01),
        #                                   dtype=tf.float32)        
        #all_weights['V'] = tf.Variable(name='V', initial_value=W, expected_shape=(N_USERS, N_HIDDEN))
        
    print('Initial weights done! Processing time:{}'.format(time.time() - start_time))
    return all_weights        
    
def initialize_weights_v2(layers, n_users, n_hidden):
    weights = {}
    with tf.variable_scope('hyperparameters', reuse=tf.AUTO_REUSE):
        with tf.name_scope('Weights'):
            xavier_init = tf.contrib.layers.xavier_initializer(seed=202109)
            for i in range(len(layers) - 1):
                layer_name = 'layer%s' % i
                weights['W' + str(i + 1)] = tf.get_variable(name="W" + str(i + 1), 
                                                            shape=(layers[i], layers[i + 1]), 
                                                            initializer=xavier_init)                
        with tf.name_scope('Biases'):        
            for i in range(len(layers) - 1):
                layer_name = 'layer%s' % i    
                weights['b'+ str(i + 1)] = tf.get_variable(name="b" + str(i + 1), 
                                                           shape=(layers[i + 1], ), 
                                                           initializer=tf.zeros_initializer())
        # User Specific 用 embedding lookup 的方式來取代
        with tf.name_scope('UserSpecific'):
            #weights['V'] = tf.get_variable(name='V', shape=(n_users, n_hidden), initializer=xavier_init)
            weights['V'] = tf.get_variable(name="V", initializer=tf.truncated_normal(shape=[n_users, n_hidden], 
                                                                                     mean=0, stddev=0.03), 
                                           dtype=tf.float32)
            #weights['V'] = [tf.Variable(tf.zeros([n_hidden])) for _ in range(n_users)]
    return weights

def initialize_weights_v3(layers_structure, n_users, n_hidden):
    weights = dict()
    xavier_init = tf.contrib.layers.xavier_initializer(seed=202109)
    with tf.variable_scope('hyperparameters', reuse=tf.AUTO_REUSE):
        for i in range(len(layers_structure) - 1):
            layer_name = 'layer%s' % i
            with tf.name_scope('Weights'):        
                weights['W' + str(i + 1)] = tf.get_variable(name=("W" + str(i)), 
                                                            #shape=(layers_structure[i], layers_structure[i + 1]),
                                                            #initializer=xavier_init, 
                                                            initializer=tf.truncated_normal(mean=0, stddev=0.01,
                                                                                           shape=(layers_structure[i], layers_structure[i + 1])),
                                                            dtype=tf.float32)
                tf.summary.histogram(name=layer_name + '/Weights', values=weights['W' + str(i + 1)])
                
            with tf.name_scope('Biases'):        
                weights['b' + str(i + 1)] = tf.get_variable(name=("b" + str(i)), 
                                                            initializer=tf.zeros(shape=layers_structure[i + 1]), 
                                                            dtype=tf.float32)
                tf.summary.histogram(name=layer_name + '/Biases', values=weights['b' + str(i + 1)])
                
        with tf.name_scope('UserSpecific'):
            weights['V'] = tf.get_variable(name="V", 
                                           initializer=tf.truncated_normal(shape=[n_users, n_hidden], mean=0, stddev=0.03),
                                           dtype=tf.float32)            
            tf.summary.histogram(name=layer_name + '/UserSp', values=weights['V'])
            
    return weights

def DAE(y, v, weights, n_items, n_users, n_hidden, dropout_prob, transfer_fn, output_fn):
    with tf.name_scope('DAE'):
        y_tilde = tf.nn.dropout(y, dropout_prob) 
        z = activation(tf.add_n([tf.matmul(y, weights['W']),
                                 tf.reshape(v, [1, n_hidden]), 
                                 tf.reshape(weights['b'], [1, n_hidden])]), transfer_fn) 
        y_hat = activation(tf.add(tf.matmul(z, tf.transpose(weights['W_prime'])), 
                                  tf.reshape(weights['b_prime'], [1, n_items])), output_fn)                    
        # tf.constant(y, dtype=tf.float32, shape=[1, n_items])
        # z = activation(tf.add_n([tf.matmul(y, weights['W1']),
        #                          tf.reshape(u, [1, n_hidden]), 
        #                          tf.reshape(weights['b1'], [1, n_hidden])]), transfer_fn) 
        # y_hat = activation(tf.add(tf.matmul(z, weights['W2']), 
        #                           tf.reshape(weights['b2'], [1, n_items])), output_fn)        
    return y_tilde, y_hat
        
def DAE_V2(corrupted_y, v, layers_structure, weights, n_items, n_users, n_hidden, dropout_prob, 
           transfer_fn, output_fn):            
    layers = int(len(layers_structure) / 2)
    y_tilde = tf.nn.dropout(corrupted_y, dropout_prob)
    with tf.name_scope('Encoder'):
        for layer in range(1, layers + 1):
            z = activation(tf.add(tf.add(tf.matmul(y_tilde, weights['W' + str(layer)]), v), 
                                  tf.reshape(weights['b' + str(layer)], [1, n_hidden])), transfer_fn)
            #z = activation(tf.add_n([tf.matmul(y_tilde, weights['W' + str(layer)]), 
            #                         v, tf.reshape(weights['b' + str(layer)], [1, n_hidden])]), 
            #               transfer_fn)   
    with tf.name_scope('Decoder'):
        for layer in range(layers + 1, 2 * layers + 1):        
            if layer == 2 * layers:
                y_hat = activation(tf.add(tf.matmul(z, weights['W' + str(layer)]), 
                                      tf.reshape(weights['b' + str(layer)], [n_items])), output_fn, name='Prediction') 
            else:
                y_hat = activation(tf.add(tf.matmul(z, weights['W' + str(layer)]), 
                                      tf.reshape(weights['b' + str(layer)], [n_items])), output_fn)
    return y_tilde, y_hat

def get_loss(corrupted_y, y_hat, weights, layers_structure, v_user, reg_lambda, _augmented_o_mask, loss_fn):
    real_batch_size = tf.cast(tf.shape(corrupted_y)[0], tf.int32)
    U = tf.cast(real_batch_size, tf.float32)
    reg_loss = tf.constant(0, dtype=tf.float32)
    
    if loss_fn == 'cross_entropy_loss':
        p = tf.nn.sigmoid(y_hat)
        #temp_p = tf.clip_by_value(y_hat, 1e-8, tf.reduce_max(y_hat))
        first_loss = -1 * tf.multiply(corrupted_y, tf.log(p)) - tf.multiply((1 - corrupted_y) , tf.log(1 - p))   
        first_loss = np.multiply(first_loss, _augmented_o_mask)
        tmpfirst_loss = tf.reduce_sum(first_loss) / U
        for itr in range(1, len(layers_structure)):        
            reg_loss = tf.add(reg_loss, tf.add(tf.nn.l2_loss(weights['W' + str(itr)]), 
                                               tf.nn.l2_loss(weights['b' + str(itr)])))            
        reg_loss = reg_lambda * 0.5 * (reg_loss + tf.nn.l2_loss(batch_V))
        loss = tmpfirst_loss + reg_loss 
        
    elif loss_fn == 'square_loss':              
        square_loss = tf.nn.l2_loss(tf.multiply(tf.subtract(corrupted_y, y_hat), _augmented_o_mask))
        loss = square_loss / U
        
    elif loss_fn == 'log_loss':    
        logloss = tf.multiply(tf.log(1 + tf.exp(tf.multiply(-corrupted_y, y_hat))), _augmented_o_mask)
        loss = tf.reduce_sum(logloss) / U
        
    elif loss_fn == 'hinge_loss':
        hingeloss = tf.multiply(1 - tf.multiply(corrupted_y, y_hat), _augmented_o_mask)
        loss = tf.maximum(0.0, tf.reduce_sum(hingeloss)) / U
        
    else: 
        raise ValueError('Unknown non-linearity type')        
    return loss

def augmented_0_set(ith_data, neg_sample_ratio=5):
    pos_examples_indices = np.where(ith_data > 0)[0]
    neg_examples_indices = np.where(ith_data == 0)[0]
    aug_set = {index for index in pos_examples_indices} 
    if len(neg_examples_indices) <= neg_sample_ratio * len(pos_examples_indices):
        for index in neg_examples_indices:
            aug_set.add(index)
    else:
        for index in np.random.permutation(neg_examples_indices)[0:neg_sample_ratio * len(pos_examples_indices)]:
            aug_set.add(index)
    return aug_set

def batch_augmented_O_set(ith_data, neg_sample_ratio=5):
    batch_augmented_O_mask = []
    for i in range(ith_data.shape[0]):
        pos_examples_indices = np.where(ith_data[i] > 0)[0]
        neg_examples_indices = np.where(ith_data[i] == 0)[0]
        aug_set = {index for index in pos_examples_indices} 
        if len(neg_examples_indices) <= neg_sample_ratio * len(pos_examples_indices):
            for index in neg_examples_indices:
                aug_set.add(index)
        else:
            for index in np.random.permutation(neg_examples_indices)[0:neg_sample_ratio * len(pos_examples_indices)]:
                aug_set.add(index)
        augmented_O_mask = np.zeros(ith_data.shape[1])
        augmented_O_mask[list(aug_set)] = 1         
        batch_augmented_O_mask.append(augmented_O_mask)    
    return np.array(batch_augmented_O_mask)


In [None]:
def get_optimizer(loss):  
    optimizer = tf.train.AdagradOptimizer(ADA_LEARNING_RATE, ADA_BETA)
    grad_and_vars = optimizer.compute_gradients(loss)
    capped_gvs = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in grad_and_vars]        
    cost_gradient_apply = optimizer.apply_gradients(capped_gvs)

In [15]:
tf.reset_default_graph()
with tf.Graph().as_default() as g:
    tf.set_random_seed(202109)
    with g.device('/cpu:0'):
            train_dataset = load_data(files_list=training_list, features_name=data_schema['DataSchema'][:-1], 
                                      batch_size=NUM_BATCH_SIZE, epochs=NUM_EPOCHS, is_training=1, name='train')    
            valid_dataset = load_data(files_list=validation_list, features_name=data_schema['DataSchema'][:-1], 
                                      batch_size=NUM_BATCH_SIZE, epochs=NUM_EPOCHS, is_training=0, name='valid') 
            iterator = tf.data.Iterator.from_structure(train_dataset().output_types, 
                                                   train_dataset().output_shapes)    
            batch_index, batch_features, batch_mask = iterator.get_next()
            train_init_op = iterator.make_initializer(train_dataset())        
            valid_init_op = iterator.make_initializer(valid_dataset())
            
    with g.device('/gpu:0'):
        with tf.name_scope('Placeholder'):
            Y = tf.placeholder(tf.float32, [None, N_ITEMS], name='Inputs')
            v_node = tf.placeholder(tf.int32, [None, N_ITEMS], name='UserNode')
            corrupted_mask = tf.placeholder(tf.int32, [None, N_ITEMS], name='Mask')
            #input_ids = tf.placeholder(tf.int32, [None], name='UserId')
            #batch_size = tf.cast(tf.shape(Y)[0], tf.int32)
            augmented_O_mask = tf.placeholder(tf.float32, [None, N_ITEMS], name='AugmentedMask')
            global_step = tf.Variable(tf.constant(0), trainable=False, name='GlobalStepCounter') 
        
        # Compute graph
        corrupted_Y = tf.multiply(Y, tf.cast(corrupted_mask, tf.float32))            
        with tf.variable_scope('Hyperparameters', reuse=tf.AUTO_REUSE):
            all_weights = initialize_weights_v3(layers_structure=LAYERS, 
                                                n_users=N_USERS, 
                                                n_hidden=N_HIDDEN)                            
        
        with tf.name_scope('DAE'):
            #batch_V = tf.reshape(tf.gather(all_weights['V'], input_ids, name='get_V'), [batch_size, N_HIDDEN])
            batch_V = tf.matmul(tf.cast(v_node, tf.float32), all_weights['V'])
            Y_tilde, Y_hat = DAE_V2(corrupted_y=corrupted_Y, v=batch_V, layers_structure=LAYERS, 
                                    weights=all_weights, n_items=N_ITEMS, n_users=N_USERS, n_hidden=N_HIDDEN, 
                                    dropout_prob=DROPOUT_PROB, transfer_fn='sigmoid', output_fn='sigmoid')
    
        with tf.name_scope('Loss'):
            final_loss = get_loss(corrupted_y=corrupted_Y, y_hat=Y_hat, weights=all_weights, 
                                  layers_structure=LAYERS, v_user=batch_V, reg_lambda=REG_LAMBDA,
                                  _augmented_o_mask=augmented_O_mask, loss_fn='square_loss')
            tf.summary.scalar(name='Loss', tensor=final_loss)
            
    #with g.device('/gpu:0'):        
        with tf.name_scope('Optimizer'):       
            learning_rate = tf.train.exponential_decay(START_LEARNING_RATE, global_step, 1000, 1.96, staircase=False)
            tf.summary.scalar(name='LR', tensor=learning_rate)
            optimizer = tf.train.AdagradOptimizer(learning_rate, ADA_BETA)
            
            grad_and_vars = optimizer.compute_gradients(final_loss)
            #gradients, variables = zip(*optimizer.compute_gradients(final_loss))
            
            gradients = [x[0] for x in grad_and_vars]
            variables = [x[1] for x in grad_and_vars]
            if gradient_clip > 0:
                clipped, _ = tf.clip_by_global_norm(gradients, gradient_clip)                
            else:
                clipped = [tf.clip_by_value(grad, -1., 1.) for grad in gradients]
                #clipped = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in grad_and_vars]
            
            cost_gradient_apply = optimizer.apply_gradients(zip(clipped, variables), global_step=global_step)
            #cost_gradient_apply = optimizer.apply_gradients(grads_and_vars=capped_gvs, global_step=global_step)
            

In [16]:
config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
config.gpu_options.allow_growth = True

with tf.Session(graph=g, config=config) as sess:
    export_dir = os.path.join(ckpt_dir, 'export')
    
    random_user_set = np.random.permutation(N_USERS)
    old_avg_cost = None
    model_improved = True
    epoch = 0
    total_train_steps = 999999 #math.floor(N_USERS / NUM_BATCH_SIZE)
    total_valid_steps = 999999
    
    # Best validation loss seen so far.
    best_valid_loss = 99999
    # Iteration-number for last improvement to validation loss.
    last_improvement = 0
    # Stop optimization if no improvement found in this many iterations.
    require_improvement = 3
    
    merged_summary = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(tensorboard_train_path, graph=sess.graph,
                                         filename_suffix='CDAE')
    test_writer = tf.summary.FileWriter(tensorboard_test_path, graph=sess.graph,
                                        filename_suffix='CDAE')
    
    init = tf.global_variables_initializer()
    start_time = time.time()
    print('Initialing all Variables...')
    sess.run(init)
    print('Initialing all variables done! Process time:{:.10}s'.format(time.time() - start_time))
    print('Start Training...')           
    
    while epoch < NUM_EPOCHS:
        print('=== Epoch: {:2d} ==='.format(epoch))
        train_sum_cost = 0
        valid_sum_cost = 0
        #user_v_index_dict= dict()

        start_time = time.time()
        sess.run(train_init_op)
        for i in range(total_train_steps): 
            try:
                train_labels, train_features, train_user_node = sess.run([batch_index, batch_features, batch_mask])
                #v_index = list(range((train_features.shape[0] * i), (train_features.shape[0] * (i + 1))))
                #for k, v in enumerate(train_labels):
                #    user_v_index_dict[v] = (NUM_BATCH_SIZE * i) + k
                mask_corruption = np.random.binomial(1, 1 - CORRUPTION_LEVEL, (train_features.shape[0], N_ITEMS))
                mask_corruption_np = np.array(mask_corruption, dtype=np.float32) 
                maskt_augmented_O = batch_augmented_O_set(train_features)
            
                train_loss, _ = sess.run([final_loss, cost_gradient_apply], 
                                         feed_dict={Y: train_features, 
                                                    corrupted_mask: mask_corruption_np,
                                                    v_node: train_user_node,
                                                    #input_ids: v_index,
                                                    augmented_O_mask: maskt_augmented_O})
                train_sum_cost += train_loss
                if i % 1000 == 0 and i > 0:
                    summary = sess.run(merged_summary, feed_dict={Y: train_features, 
                                                                  corrupted_mask: mask_corruption_np,
                                                                  v_node: train_user_node,
                                                                  augmented_O_mask: maskt_augmented_O})    
                    train_writer.add_summary(summary, epoch * total_train_steps + i)
                    print('Training-Iteration: {:4d}, loss: {:.8f}'.format(i, train_sum_cost / i))
                    if np.isnan(train_loss):
                        break                    
            except tf.errors.OutOfRangeError:
                print('Iter: {:4d} is end of line.'.format(i))
                break
        
        end_time = time.time()
        train_avg_cost = train_sum_cost / i
        #summary = sess.run(merged_summary, feed_dict={Y: train_features, 
        #                                              corrupted_mask: mask_corruption_np,
        #                                              v_node: train_user_node,
        #                                              #input_ids: v_index,
        #                                              augmented_O_mask: maskt_augmented_O})    
        #train_writer.add_summary(summary, epoch) # epoch * total_train_steps + i
        
        # Validation
        sess.run(valid_init_op)
        for j in range(total_valid_steps):         
            try: 
                valid_labels, valid_features, valid_user_node = sess.run([batch_index, batch_features, batch_mask])
                #v_index = list(range((valid_features.shape[0] * j), (valid_features.shape[0] * (j + 1))))
                mask_corruption = np.random.binomial(1, 1 - CORRUPTION_LEVEL, (valid_features.shape[0], N_ITEMS))
                mask_corruption_np = np.array(mask_corruption, dtype=np.float32) 
                maskt_augmented_O = batch_augmented_O_set(valid_features)
            
                valid_loss, _ = sess.run([final_loss, cost_gradient_apply], 
                                         feed_dict={Y: valid_features, 
                                                    corrupted_mask: mask_corruption_np,
                                                    v_node: valid_user_node,
                                                    #input_ids: v_index,
                                                    augmented_O_mask: maskt_augmented_O})
                valid_sum_cost += valid_loss
                if j % 1000 == 0 and j > 0:
                    print('Testing-Iteration: {:4d}, loss: {:.8f}'.format(j, valid_sum_cost / j))
                    valid_summary = sess.run(merged_summary, feed_dict={Y: valid_features, 
                                                                        corrupted_mask: mask_corruption_np,
                                                                        v_node: valid_user_node,
                                                                        augmented_O_mask: maskt_augmented_O})    
                    test_writer.add_summary(valid_summary, epoch * total_train_steps + i)
                    if np.isnan(valid_loss):
                        break  
            except tf.errors.OutOfRangeError:
                print('Iter: {:4d} is end of line.'.format(j))
                break
                
        end_time = time.time()
        valid_avg_cost = valid_sum_cost / j
        #valid_summary = sess.run(merged_summary, feed_dict={Y: valid_features, 
        #                                                    corrupted_mask: mask_corruption_np,
        #                                                    v_node: valid_user_node,
        #                                                    #input_ids: v_index,
        #                                                    augmented_O_mask: maskt_augmented_O})    
        #test_writer.add_summary(valid_summary, epoch) # epoch * total_train_steps + i
        
        # If validation accuracy is an improvement over best-known.
        if valid_avg_cost < best_valid_loss:
            best_valid_loss = valid_avg_cost                            
            last_improvement = epoch
            #saver.save(sess, os.path.join(ckpt_dir, 'DeepRecommender.ckpt'), global_step=epoch * total_train_steps + i)
            inputs = g.get_tensor_by_name('Placeholder/Inputs:0')
            inputs_user_node = g.get_tensor_by_name('Placeholder/UserNode:0')
            inputs_mask = g.get_tensor_by_name('Placeholder/Mask:0')
            inputs_augmented_mask = g.get_tensor_by_name('Placeholder/AugmentedMask:0')
            prediction = g.get_tensor_by_name('DAE/Decoder/Prediction:0')            
            
            model_input = {
                'inputs': tf.saved_model.utils.build_tensor_info(inputs),
                'v_node': tf.saved_model.utils.build_tensor_info(inputs_user_node),
                'corrupted_mask': tf.saved_model.utils.build_tensor_info(inputs_mask),
                'augmented_O_mask': tf.saved_model.utils.build_tensor_info(inputs_augmented_mask)
            }
            
            model_output = {
                'outputs': tf.saved_model.utils.build_tensor_info(prediction)
            }
            # Build signature definition
            signature = tf.saved_model.signature_def_utils.build_signature_def(
                inputs=model_input,
                outputs=model_output,
                method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME
            )
            
            if os.path.exists(export_dir):
                shutil.rmtree(export_dir, ignore_errors=True)
            builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
            builder.add_meta_graph_and_variables(
                sess, [tf.saved_model.tag_constants.SERVING], 
                signature_def_map={
                    tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature
                }, strip_default_attrs=True)
            builder.save()
            improved_str = '*'

        if epoch - last_improvement > require_improvement:
            print("No improvement found in a while, stopping optimization.")
            break
            
        #model_improved = True if old_avg_cost == None or old_avg_cost > avg_cost else False       
        print('#### Epoch:{}, Train Cost:{:.8f}, Valid Cost:{:.8f}, Processing time:{:.5f}s'.format(epoch, 
                                                                                                     train_avg_cost,
                                                                                                     valid_avg_cost,
                                                                                                     end_time - start_time))
        epoch += 1                 

        

Initialing all Variables...
Initialing all variables done! Process time:0.06098985672s
Start Training...
=== Epoch:  0 ===
Training-Iteration: 1000, loss: 11.09694900
Training-Iteration: 2000, loss: 11.09650890
Training-Iteration: 3000, loss: 11.12478720
Training-Iteration: 4000, loss: 11.11947479
Training-Iteration: 5000, loss: 11.12444935
Training-Iteration: 6000, loss: 11.11838916
Training-Iteration: 7000, loss: 11.09638240
Training-Iteration: 8000, loss: 11.06739746
Iter: 8862 is end of line.
Testing-Iteration: 1000, loss: 9.71549552
Testing-Iteration: 2000, loss: 9.57642125
Testing-Iteration: 3000, loss: 9.47565736
Iter: 3709 is end of line.
INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: b'/home/yuyuliao/tensorflow_ckpt/CDAE/export/saved_model.pb'
#### Epoch:0, Train Cost:11.02598670, Valid Cost:9.41682449, Processing time:240.62308s


# Use SavedModel module to prediction

In [17]:
HOME = Path(os.environ['HOME'])
tensorboard_path = Path(HOME) / 'tensorboard_files'
ckpt_dir = str(HOME / 'tensorflow_ckpt' / 'CDAE')
export_dir = os.path.join(ckpt_dir, 'export')

In [18]:
%%time
tf.reset_default_graph()
pred_config = tf.ConfigProto(allow_soft_placement = True)

with tf.Graph().as_default() as pred_g: 
    test_dataset = load_data(files_list=validation_list, features_name=data_schema['DataSchema'][:-1], 
                              batch_size=NUM_BATCH_SIZE, epochs=NUM_EPOCHS, is_training=0, name='test') 
    test_iterator = tf.data.Iterator.from_structure(test_dataset().output_types, 
                                               test_dataset().output_shapes)    
    batch_index, batch_features, batch_mask = test_iterator.get_next()
    test_init_op = test_iterator.make_initializer(test_dataset())    

    
with tf.Session(config=pred_config, graph=pred_g) as pred_sess:
    total_test_steps = 99999
    meta_graph_def = tf.saved_model.loader.load(pred_sess, [tf.saved_model.tag_constants.SERVING], 
                                                export_dir)
    signature = meta_graph_def.signature_def
    signature_dict = signature[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
    x_tensor_name = signature_dict.inputs['inputs'].name
    v_node_tensor_name = signature_dict.inputs['v_node'].name
    corrupted_mask_tensor_name = signature_dict.inputs['corrupted_mask'].name
    augmented_O_mask_tensor_name = signature_dict.inputs['augmented_O_mask'].name
    y_tensor_name = signature_dict.outputs['outputs'].name
    pred_sess.run(test_init_op)
    #graph = tf.get_default_graph()
    #graph_nodes_name = [n.name for n in graph.as_graph_def().node]
    prediction_recoder = []
    X = pred_sess.graph.get_tensor_by_name(x_tensor_name)
    v_node = pred_sess.graph.get_tensor_by_name(v_node_tensor_name)
    corrupted_mask = pred_sess.graph.get_tensor_by_name(corrupted_mask_tensor_name)
    augmented_O_mask = pred_sess.graph.get_tensor_by_name(augmented_O_mask_tensor_name)    
    Y = pred_sess.graph.get_tensor_by_name(y_tensor_name)
    for _ in range(total_test_steps):
        try:
            test_labels, test_features, test_user_node = pred_sess.run([batch_index, batch_features, batch_mask])
            mask_corruption = np.random.binomial(1, 1 - CORRUPTION_LEVEL, (test_features.shape[0], N_ITEMS))
            mask_corruption_np = np.array(mask_corruption, dtype=np.float32) 
            maskt_augmented_O = batch_augmented_O_set(test_features)
            
            decode_test_labels = [l.decode('utf-8') for l in test_labels]            
            predicted_result = pred_sess.run(Y, feed_dict={
                X: test_features, 
                v_node: test_user_node, 
                corrupted_mask: mask_corruption_np, 
                augmented_O_mask: maskt_augmented_O
            })  
            prediction_recoder.append(list(zip(decode_test_labels, np.round(predicted_result, 6))))
        except tf.errors.OutOfRangeError:
            break

print('Num of iteration:{}, total predicted count: {}'.format(len(prediction_recoder), 
                                                              len(prediction_recoder) * NUM_BATCH_SIZE))            

INFO:tensorflow:Restoring parameters from b'/home/yuyuliao/tensorflow_ckpt/CDAE/export/variables/variables'
Num of iteration:3709, total predicted count: 474752
CPU times: user 7min 22s, sys: 9.27 s, total: 7min 32s
Wall time: 1min 4s


# Distributed representations of weight version
From another point of view, W i and V u can be seen as the distributed representations of item i and 
user u respectively.


In [None]:
def xavier_init(node_in, node_out, constant=1):
    low = -constant * np.sqrt( 6 / (node_in + node_out))
    high = constant * np.sqrt( 6 / (node_in + node_out))
    return tf.random_uniform((node_in, node_out), minval=low, maxval=high, dtype=tf.float32)

class CDAE:
    def __init__(self, input_class, sparse=True):
        #self.input_data = input_class(sparse=sparse)
        self.input_data = input_class
        self.sparse = sparse
        self.n_items  = self.input_data.get_n_items()
        self.n_users  = self.input_data.get_n_users()

    def _init_variables(self, n_hidden=50):
        start_time = time.time()
        print('Initial weights...')
        W = xavier_init(self.n_items, n_hidden)
        W_prime = xavier_init(self.n_items, n_hidden)
        all_weights = dict()
        all_weights['W'] = [tf.Variable(W[i]) for i in range(self.n_items)]
        all_weights['W_prime'] = [tf.Variable(W_prime[i]) for i in range(self.n_items)]
        all_weights['b'] = tf.Variable(tf.zeros([1, n_hidden], dtype=tf.float32))
        all_weights['b_prime'] = [tf.Variable(tf.constant(0, dtype=tf.float32), dtype=tf.float32) for _ in range(self.n_items)]
        all_weights['V'] = [tf.Variable(tf.zeros([n_hidden])) for _ in range(self.n_users)]
        print('Initial weights done! Processing time:{:.10}s'.format(time.time() - start_time))
        return all_weights
    
    def _set_up_training_ops(self, **kwargs):
        print('Create graph...')
        # Init parameters
        dropout_prob = kwargs.get('dropout_prob') if kwargs.get('dropout_prob') else 0.2
        transfer_fn = kwargs.get('transfer_fn') if kwargs.get('transfer_fn') else tf.sigmoid # hidden layer
        output_fn = kwargs.get('output_fn') if kwargs.get('output_fn') else tf.identity # output layer
        n_hidden = kwargs.get('n_hidden') if kwargs.get('n_hidden') else 50
        
        # AdaGrad Parameters
        ada_beta = kwargs.get('ada_beta') if kwargs.get('ada_beta') else 1.0
        ada_learning_rate = kwargs.get('ada_learning_rate') if kwargs.get('ada_learning_rate') else 0.01
        
        optimizer = tf.train.AdagradOptimizer(ada_learning_rate, ada_beta)
        #optimizer = tf.train.GradientDescentOptimizer(ada_learning_rate)
        
        # Regularization Parameter
        reg_lambda = kwargs.get('reg_lambda') if kwargs.get('reg_lambda') else 1.0
        
        # Model ops
        self.y = tf.placeholder(tf.float32, [None, self.n_items])
        self.z = lambda u: transfer_fn(tf.add_n([tf.matmul(self.y, self.weight['W']),
                                                 tf.reshape(self.weight['V'][u], [1, n_hidden]), # (1, 50)
                                                 tf.reshape(self.weight['b'], [1, n_hidden])])) # (1, 50)
        self.y_hat = lambda u: output_fn(tf.add(tf.matmul(self.z(u), 
                                                          tf.transpos(self.weight['W_prime'])), # (1, 5551)
                                                tf.reshape(self.weight['b_prime'], [1, self.n_items]))) # (1, 16980)
        
        self.loss = lambda u: tf.nn.l2_loss(tf.subtract(self.y, self.y_hat(u)))
        
        # Gradient Ops
        self.dLoss_d = lambda u, var: optimizer.compute_gradients(self.loss(u), var_list=[var])[0][0]
        self.dCost_d = lambda u, var: tf.multiply(1.0 / self.n_users, self.dLoss_d(u, var)) - tf.multiply(reg_lambda, var)
        self.cost_gradient_apply = lambda u, var: optimizer.apply_gradients([[self.dCost_d(u, var), var]])
        print('Create graph done!')
        self.init_op = tf.global_variables_initializer()
    
    def _create_augmented_0_set(self, ith_data, neg_sample_ratio=5):
        print('Create augmented data set...')
        pos_example_indices = np.where(ith_data > 0)[0]
        neg_example_indices = np.where(ith_data == 0)[0]
        aug_set = {index for index in pos_example_indices}
        if len(neg_example_indices) <= neg_sample_ratio * len(pos_example_indices):
            for index in neg_example_indices:
                aug_set.add(index)
        else:
            for index in np.random.permutation(neg_example_indices)[0:neg_sample_ratio * len(pos_example_indices)]:
                aug_set.add(index)
        print('Create augmented data set done!')
        return aug_set    
    
    def train(self, **kwargs):
        MAX_ITERATION = kwargs.get('max_iteration') if kwargs.get('max_iteration') else 1000
        dropout_prob = kwargs.get('dropout_prob') if kwargs.get('dropout_prob') else 0.2
        n_hidden = kwargs.get('n_hidden') if kwargs.get('n_hidden') else 50
        dropout_prob = tf.constant(dropout_prob, dtype=tf.float32)
        iteration = 0

        self.weight = self._init_variables(n_hidden) 
        self._set_up_training_ops(**kwargs)
        
        old_avg_cost = None
        model_improved = True
        input_data = self.input_data.get_data()
        sess = tf.Session()
        start_time = time.time()
        print('Initialing all Variables...')
        sess.run(self.init_op)
        print('Done! Process time:{:.10}s'.format(time.time() - start_time))
        print('Start Training... ')
        
        while iteration < MAX_ITERATION or model_improved: 
            print('=== Iter:{} ==='.format(iteration))
            old_avg_cost = avg_cost if iteration >= 1 else None 
            sum_cost = 0
            
            for u in range(self.n_users):
                val_data = input_data[u].get_full_data() # the data value incloud 0 and 1
                train_data = input_data[u].get_train_data()
                    
                if self.sparse:
                    try:
                        val_data = sess.run(tf.sparse_tensor_to_dense(val_data))
                        train_data = sess.run(tf.sparse_tensor_to_dense(train_data))
                    except:
                        print('user: {}'.format(u))
                # using Equation 9.
                y_tilde = sess.run(tf.nn.dropout(tf.constant(train_data, 
                                                             dtype=tf.float32, 
                                                             shape=[1, self.n_items]), dropout_prob))
                augmented_0_set = self._create_augmented_0_set(np.array(val_data))
                    
                for item in augmented_0_set:
                    sess.run(self.cost_gradient_apply(u, self.weight['W_prime'][item]), feed_dict={self.y: y_tilde})
                    sess.run(self.cost_gradient_apply(u, self.weight['b_prime'][item]), feed_dict={self.y: y_tilde})
                
                for item in np.where(y_tilde == 1):
                    sess.run(self.cost_gradient_apply(u, self.weight['W'][item]), feed_dict={self.y: y_tilde})
                    
                sess.run(self.cost_gradient_apply(u, self.weight['V'][u]), feed_dict={self.y: y_tilde})
                sess.run(self.cost_gradient_apply(u, self.weight['b']), feed_dict={self.y: y_tilde})
                sum_cost += sess.run(self.loss(u), feed_dict={self.y: val_data})
            avg_cost = sum_cost / self.n_users
            model_improved = True if old_avg_cost == None or old_avg_cost > avg_cost else False
            print('Iteration: {}, Average cost per User:{}'.format(i, cost))
            iteration += 1
        sess.close()
    
    def _count_num_accurate_classification(self, ground_turth, recommendations):
        ''' 
        |Intersection(C-adopted, C-NRecommended)|
        
        :param List ground_turth  : Actual items that user liked
        :param List recommendation: Items the system recommended to the user
        
        :return Int               : Length of intersection between system recommended items and 
                                    items the user actually liked 
        '''
        num_accurate_predictions = 0
        for item in recommendations:
            if item in ground_turth:
                num_accurate_predictions += 1
        return num_accurate_predictions        
    
    def precision(self, actual, predicted, N=5):
        ''' 
        Calculates precision defined by: |Intersection(C-adopted, C-NRecommended)| / N
        
        :param List actual   : Actual items that were liked
        :param List predicted: Prediction of actual items
        :param Int  N        : Number of items to be recommended (default=5)

        :return Float        : Number b/w 0 to 1 corresponding to the precision of the recommendation list
        '''
        if len(predicted) > N:
            predicted = predicted[:N]
        num_accurate_predictions = self._count_num_accurate_classification(actual, predicted)
        return float(num_accurate_predictions) / N    
            
    def recall(self, actual, predicted, N=5):
        ''' 
        Calculates recall defined by: |Intersection(C-adopted, C-NRecommended)| / C-adopted
        
        :param List actual   : Actual items that were liked
        :param List predicted: Prediction of actual items, 
        :param Int  N        : Number of items to be recommended (default=5)

        :return Float        : Number b/w 0 to 1 corresponding to the recall of the recommendation list
        '''
        num_accurate_predictions = self._count_num_accurate_classification(actual, predicted)
        return float(num_accurate_predictions) / len(actual)
        
    
    def recommend(self, u, Y, N=5, test_set=False):
        '''
        Method recommend outputs the top-N item recommendations for user u
           
        :param Int u                : Id for user to be recommended
        :param AbstractInputSource Y: User data class that encapsulates training, test, and full data
        :param Int N                : Number of items to be recommended
        :param Boolean test_set     : Necessary flag for checking the accuracy of the training set to the test set (defualt=Fasle)

        :return List                : List: Top N list of items recommend to user u  
        '''        
        
        if test_set:
            recommendation_candidates = Y.get_neg_indices()
            
            if self.sparse:
                train_data = self.sess.run(tf.sparse_tensor_to_dense(Y.get_train_data()))
            else:
                train_data = Y.get_train_data()
            feed_dict = {self.y: train_data}
        else:
            recommendation_candidates = np.where(Y==0)
            feed_dict = {self.y: Y}
        y_hat = self.sess.run(self.y_hat(u), feed_dict=feed_dict)
        top_items_sorted = sorted(recommendation_candidates, key=lambda i: y_hat[i], reverse=True)
        
        return top_items_sorted[:N] if len(top_items_sorted) > N else top_items_sorted
        
    def average_precision(self, actual, u, Y, N):
        '''
        Average Precision at N is defined as: sum(Precision@k * Rel(K), k, 1, N) / min(N, |C-adopted|)
        
        :param List actual   : Actual items that were liked
        :param Int u                : Represents user u 
        :param AbstractInputSource Y: User data class that encapsulates training, test, and full data
        :param Int N                : Number of items to be recommended

        :return Float               : Number b/w 0 to 1 corresponding to the Average precision of the recommendation list
        '''
        min_N_adopted = min(N, len(actual))
        ground_turth = Y.get_test_data()
        predicted = self.recommend(u, Y, N=N, test_set=True)
        
        sum_precision = 0
        for k in range(1, N+1):
            if predicted[k] in ground_turth:
                sum_precision += self.precision(ground_turth, predicted, N=k)
            else:
                sum_precision += 0.0
                
        return sum_precision / min_N_adopted

    def mean_average_precision(self, N):
        '''
        Average of AP for all users
        
        :param Int N : Number of items to be recommended
        
        :return Float: Number b/w 0 to 1 corresponding to the Mean Average Precision@N
        '''
        
        sum_ap = 0
        for i, user in enumerate(self.input_data.get_data()):
            sum_ap += self.average_precision(i, user, N)
            
        return sum_ap / len(self.users)
        
        
            

In [None]:
#cdae_model = CDAE(CiteYouLikeA, sparse=False)
cdae_model = CDAE(input_data)
cdae_model.train()

# Use SavedModel module to prediction

In [None]:
tensorboard_path = Path(HOME) / 'tensorboard_files'
ckpt_dir = str(HOME / 'tensorflow_ckpt' / 'DeepRcommenderOneAD')
export_dir = os.path.join(ckpt_dir, 'export')

In [None]:
%%time
tf.reset_default_graph()
config = tf.ConfigProto(allow_soft_placement = True)

with tf.Graph().as_default() as g: 
    valid_dataset = load_data(files_list=validation_list, features_name=data_schema['DataSchema'][:-1], 
                              batch_size=NUM_BATCH_SIZE, epochs=NUM_EPOCHS, is_training=0) 
    iterator = tf.data.Iterator.from_structure(valid_dataset().output_types, 
                                               valid_dataset().output_shapes)    
    batch_index, batch_features = iterator.get_next()
    valid_init_op = iterator.make_initializer(valid_dataset())    

    
with tf.Session(config=config, graph=g) as pred_sess:
    total_valid_steps = 99999
    meta_graph_def = tf.saved_model.loader.load(pred_sess, [tf.saved_model.tag_constants.SERVING], export_dir)
    signature = meta_graph_def.signature_def
    x_tensor_name = signature[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY].inputs['inputs'].name
    y_tensor_name = signature[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY].outputs['outputs'].name

    #graph = tf.get_default_graph()
    #graph_nodes_name = [n.name for n in graph.as_graph_def().node]
    pred_sess.run(valid_init_op)
    prediction_recoder = []
    X = pred_sess.graph.get_tensor_by_name(x_tensor_name)
    Y = pred_sess.graph.get_tensor_by_name(y_tensor_name)
    for _ in range(total_valid_steps):
        try:
            valid_labels, valid_features = pred_sess.run([batch_index, batch_features])
            decode_valid_labels = [l.decode('utf-8') for l in valid_labels]            
            predicted_result = pred_sess.run(Y, feed_dict={X: valid_features})                         
            prediction_recoder.append(list(zip(decode_valid_labels, predicted_result)))
        except tf.errors.OutOfRangeError:
            break

print('Num of iteration:{}, total predicted count: {}'.format(len(prediction_recoder), 
                                                              len(prediction_recoder) * NUM_BATCH_SIZE))            

In [None]:
%%time
import multiprocessing

def get_remaind_list(_data, _target_uuid_list):
    remained_uuid_list = []
    for k, v in _data:
        if k in _target_uuid_list:
            remained_uuid_list.append((k, v))
    # [(k, v) for k, v in _data if k in _target_uuid_list]            
    return remained_uuid_list

target_uuid_list = test_data.parse_uuid.values.tolist()
pool_size = multiprocessing.cpu_count()
pool = multiprocessing.Pool(processes=pool_size)
result = []
for d in prediction_recoder:
    result.append(pool.apply_async(get_remaind_list, args=(d, target_uuid_list, )))
pool.close()
pool.join()

get_process_result = []
for res in result:
    get_process_result.append(res.get(10))

In [None]:
%%time
N = 5 # len(data_schema['DailyVideoQueue'])
has_recommend_list = []
for i, line in enumerate(get_process_result):
    for uuid, pred_matrix in line:
        #actual_data = test_data[test_data.parse_uuid == uuid].iloc[:, :-1].values[0]
        recommendation_candidates  = np.where(pred_matrix > 0)[0]
        top_items_sorted = sorted(recommendation_candidates, key=lambda i: pred_matrix[i], reverse=True)
        top_items_socre = [pred_matrix[i] for i in top_items_sorted]
        top_recommended_items = top_items_sorted[:N] if len(top_items_sorted) > N else top_items_sorted
        recommended_items_name = [data_schema['DataSchema'][i] for i in top_recommended_items]
        #recommend_video_list = [v for v in recommended_items_name if int(v) in data_schema['DailyVideoQueue']]
        recommend_video_list = [v for v in recommended_items_name if v in data_schema['DataSchema']]
        if len(recommend_video_list) != 0:
            has_recommend_list.append((uuid, recommend_video_list, top_items_socre))

In [None]:
%%time
pred_uuid_list = [uuid for uuid, _, _ in has_recommend_list]
target_uuid_list = test_data.parse_uuid.values.tolist()
target_uuid_dict = {}
for k, v in enumerate(target_uuid_list):
    target_uuid_dict[v] = k
    
remaind_index_list = [target_uuid_dict[uuid] for uuid in pred_uuid_list]
#remaind_index_list = [target_uuid_list.index(uuid) for i, uuid in enumerate(pred_uuid_list) if uuid in target_uuid_list]
print('Remained numnber of index: {}'.format(len(remaind_index_list)))

## Netflix version

In [None]:
with tf.Session(graph=g, config=config) as sess:
    random_user_set = np.random.permutation(N_USERS)
    old_avg_cost = None
    model_improved = True
    epoch = 0
    total_train_steps = math.floor(N_USERS / NUM_BATCH_SIZE)
    
    # Best validation loss seen so far.
    best_valid_loss = 99999
        
    # Iteration-number for last improvement to validation loss.
    last_improvement = 0

    # Stop optimization if no improvement found in this many iterations.
    require_improvement = 2
    
    merged_summary = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(tensorboard_train_path, graph=sess.graph,
                                         filename_suffix='CDAE')
    test_writer = tf.summary.FileWriter(tensorboard_test_path, graph=sess.graph,
                                        filename_suffix='CDAE')
    
    init = tf.global_variables_initializer()
    start_time = time.time()
    print('Initialing all Variables...')
    sess.run(init)
    print('Initialing all variables done! Process time:{:.10}s'.format(time.time() - start_time))
    print('Start Training... ')           
        
    while epoch <= NUM_EPOCHS:
        print('=== Epoch: {}, Total batch iter:{} ==='.format(epoch, total_train_steps))
        sum_cost = 0
        start_time = time.time()
        for i, mini_batch in enumerate(iterate_one_epoch(train_data, NUM_BATCH_SIZE, N_ITEMS)):            
            mask_corruption = np.random.binomial(1, 1 - CORRUPTION_LEVEL, (NUM_BATCH_SIZE, N_ITEMS))
            mask_corruption_np = np.array(mask_corruption, dtype=np.float32) 
            maskt_augmented_O = create_augmented_O_set(mini_batch['data'])
            
            train_loss, _ = sess.run([tmp_loss, cost_gradient_apply], 
                                     feed_dict={Y: mini_batch['data'], 
                                                corrupted_mask: np_mask_corruption,
                                                input_ids: mini_batch['index'],
                                                augmented_O_mask: maskt_augmented_O})
            sum_cost += train_loss
            if i % 50 == 0 and i > 0:
                print('Iteration: {:4d}, loss: {:.8f}'.format(i, train_loss))
                summary = sess.run(merged_summary, feed_dict={Y: mini_batch['data'], 
                                                              corrupted_mask: np_mask_corruption,
                                                              input_ids: mini_batch['index'],
                                                              augmented_O_mask: maskt_augmented_O})    
                train_writer.add_summary(summary, epoch * total_train_steps + i)
                if np.isnan(train_loss):
                    break        
            
        end_time = time.time()
        avg_cost = sum_cost / i
        
        # Validation
        for i, mini_batch in enumerate(iterate_one_epoch_valid(train_data, NUM_BATCH_SIZE, N_ITEMS)):            
            mask_corruption = np.random.binomial(1, 1 - CORRUPTION_LEVEL, (NUM_BATCH_SIZE, N_ITEMS))
            mask_corruption_np = np.array(mask_corruption, dtype=np.float32) 
            maskt_augmented_O = create_augmented_O_set(mini_batch['data'])
            
            train_loss, _ = sess.run([tmp_loss, cost_gradient_apply], 
                                     feed_dict={Y: mini_batch['data'], 
                                                corrupted_mask: np_mask_corruption,
                                                input_ids: mini_batch['index'],
                                                augmented_O_mask: maskt_augmented_O})
            sum_cost += train_loss
            if i % 50 == 0 and i > 0:
                print('Iteration: {:4d}, loss: {:.8f}'.format(i, train_loss))
                summary = sess.run(merged_summary, feed_dict={Y: mini_batch['data'], 
                                                              corrupted_mask: np_mask_corruption,
                                                              input_ids: mini_batch['index'],
                                                              augmented_O_mask: maskt_augmented_O})    
                train_writer.add_summary(summary, epoch * total_train_steps + i)
                if np.isnan(train_loss):
                    break        
            
        end_time = time.time()
        avg_cost = sum_cost / i
        
        # If validation accuracy is an improvement over best-known.
        if avg_cost < best_valid_loss:
            best_valid_loss = avg_cost                            
            last_improvement = epoch
            saver.save(sess, os.path.join(ckpt_dir, 'DeepRecommender.ckpt'), 
                       global_step=epoch * total_train_steps + i)
            improved_str = '*'        

        if epoch - last_improvement > require_improvement:
            print("No improvement found in a while, stopping optimization.")
            break
            
        #model_improved = True if old_avg_cost == None or old_avg_cost > avg_cost else False       
        print('#### Epoch: {}, Aerage Cost per User:{:.10f}, Processing time:{:.5f}s'.format(epoch, 
                                                                                             avg_cost, 
                                                                                             end_time - start_time))
        epoch += 1                 

        