In [393]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [356]:
import numpy as np
import builtins
import keras
import tensorflow as tf
import math
from extra_keras_metrics import average_precision_at_k

In [357]:
import random
import keras_metrics as km

In [358]:
import sys
sys.path.append('..')

In [359]:
from data_loader import data_generator

In [370]:
import os

In [384]:
# Set the TF_KERAS environment flag for this process.
# NOTE(review): presumably read by the position-embedding package at import time to
# select tf.keras over standalone Keras, so it must run before that import — confirm.
os.environ['TF_KERAS'] = '1'

In [385]:
os.environ['TF_KERAS']

'1'

In [360]:
tf.__version__

'1.14.0'

In [423]:
#from keras_pos_embd import PositionEmbedding, TrigPosEmbedding
import trig_pos_embd   # https://github.com/CyberZHG/keras-pos-embd
import pos_embd

In [390]:
keras_pos_embd.backend.backend

<module 'keras.backend' from '/Users/pengcheng.jia/anaconda3/envs/reco/lib/python3.6/site-packages/keras/backend/__init__.py'>

In [391]:
tf.keras.backend

<module 'tensorflow.python.keras.api._v1.keras.backend' from '/Users/pengcheng.jia/anaconda3/envs/reco/lib/python3.6/site-packages/tensorflow/python/keras/api/_v1/keras/backend/__init__.py'>

In [424]:
class RecoDNN():
    """Feed-forward recommender built from pooled history embeddings.

    Three integer history sequences are embedded, given additive position
    embeddings and average-pooled.  The pooled vectors are concatenated with
    the optional single-valued categorical embeddings and the numeric feature
    vector, passed through four dense layers (the last one is named
    ``user_embedding``) and finally through a softmax over ``category_size``
    classes.  The assembled Keras model is exposed as ``self.model``.

    Args:
        max_transaction_history: length of the ``seq_categorical_0`` input.
        max_product_click_history: length of the ``seq_categorical_1`` input.
        max_promotion_click_history: length of the ``seq_categorical_2`` input.
        category_size: vocabulary size of the sequence embeddings and number
            of softmax outputs.
        single_categorical_features: optional mapping
            ``{name: (unique_size, embedding_size)}``; one scalar input is
            created per entry.
        numeric_features_size: width of the float ``numeric`` input.
        hidden_layer1_size, hidden_layer2_size, hidden_layer3_size: widths of
            the dense layers that follow the fixed 512-unit first layer.
        activation: activation used by every dense layer except the softmax.
        input_embedding_size: embedding width of the sequence inputs.
    """

    def __init__(self, max_transaction_history = 50, max_product_click_history = 50, max_promotion_click_history = 50,
                 category_size = 100, single_categorical_features = None, numeric_features_size = 10,
                 hidden_layer1_size = 256, hidden_layer2_size = 128, hidden_layer3_size = 64, activation='relu',
                 input_embedding_size = 128):

        self.max_transaction_history = max_transaction_history
        self.max_product_click_history = max_product_click_history
        self.max_promotion_click_history = max_promotion_click_history
        self.category_size = category_size
        self.hidden_layer1_size = hidden_layer1_size
        self.hidden_layer2_size = hidden_layer2_size
        self.hidden_layer3_size = hidden_layer3_size
        self.single_categorical_features = single_categorical_features
        self.numeric_features_size = numeric_features_size
        self.activation = activation
        self.input_embedding_size = input_embedding_size

        # Kept for backward compatibility: this layer is created but never wired
        # into the graph; every sequence input builds its own embedding table.
        self.category_embeddings = tf.keras.layers.Embedding(output_dim=self.input_embedding_size,
                                                             input_dim = self.category_size, mask_zero=True, name='category_embeddings')

        self.build()

    def build(self):
        """Assemble the dense tower on top of the inputs and store it in ``self.model``."""
        inp_layer, inp_embed = self.create_input()
        v = tf.keras.layers.Dense(512, activation = self.activation)(tf.keras.layers.concatenate(inp_embed))
        v = tf.keras.layers.Dense(self.hidden_layer1_size, activation = self.activation)(v)
        v = tf.keras.layers.Dense(self.hidden_layer2_size, activation = self.activation)(v)
        # Named so the activations can be pulled out later with get_layer('user_embedding').
        v = tf.keras.layers.Dense(self.hidden_layer3_size, activation = self.activation, name='user_embedding')(v)
        output = tf.keras.layers.Dense(self.category_size, activation ='softmax', name='softmax_layer')(v)
        self.model = tf.keras.models.Model(inputs = inp_layer, outputs = [output])

    def create_input(self):
        """Create every model input together with its dense representation.

        Returns:
            ``(inp_layer, inp_embed)``: two parallel lists ordered as sequence
            inputs, single categorical inputs, numeric input.  ``inp_layer``
            holds the ``Input`` tensors, ``inp_embed`` the tensors that are
            concatenated and fed to the dense tower.
        """
        history_lengths = [self.max_transaction_history,
                           self.max_product_click_history,
                           self.max_promotion_click_history]
        seqs = [self.seq_categorical_input('seq_categorical_' + str(i), length)
                for i, length in enumerate(history_lengths)]

        singles = []
        if self.single_categorical_features:
            for col, spec in self.single_categorical_features.items():
                # spec is (unique_size, embedding_size)
                singles.append(self.singe_categorical_input(str(col), spec[0], spec[1]))

        nums = self.continous_inputs(self.numeric_features_size)

        branches = seqs + singles + [nums]
        inp_layer = [branch[0] for branch in branches]
        inp_embed = [branch[1] for branch in branches]
        return inp_layer, inp_embed

    def seq_categorical_input(self, name, max_history):
        """Build one padded-sequence input and its average-pooled embedding.

        Args:
            name: name of the ``Input`` layer; also used as prefix/suffix of
                the layers created here.
            max_history: fixed length of the integer sequence.

        Returns:
            ``(seq, avg_embedding)``: the ``Input`` tensor and the pooled
            embedding tensor.
        """
        seq = tf.keras.layers.Input(shape=(max_history,), dtype='int32', name=name)
        item_embedding = tf.keras.layers.Embedding(output_dim=self.input_embedding_size,
                                                   input_dim=self.category_size,
                                                   mask_zero=True,
                                                   name=name+'category_embeddings')
        input_embeddings = item_embedding(seq)

        # The position table is indexed by position in the sequence, so it is sized
        # by the sequence length (not by the item vocabulary).  The class is reached
        # through the imported `pos_embd` module; the bare name `PositionEmbedding`
        # is not imported in this notebook.
        position_embeddings = pos_embd.PositionEmbedding(input_dim=max_history,
                                                         output_dim=self.input_embedding_size,
                                                         mode=pos_embd.PositionEmbedding.MODE_ADD)
        input_embeddings = position_embeddings(input_embeddings)

        avg_embedding = tf.keras.layers.GlobalAveragePooling1D(name=name + '_avg_embedding')(input_embeddings)
        return seq, avg_embedding

    def singe_categorical_input(self, name, unique_size, embedding_size):
        """Build a scalar categorical input and its flattened embedding.

        Args:
            name: name of the ``Input`` layer; the embedding and flatten layers
                derive their names from it.
            unique_size: ``input_dim`` of the embedding table.
            embedding_size: ``output_dim`` of the embedding table.

        Returns:
            ``(single, embeddings)``: the ``Input`` tensor and the flattened
            embedding tensor.
        """
        single = tf.keras.layers.Input(shape=(1,), dtype='int32', name=name)
        embeddings = tf.keras.layers.Embedding(output_dim = embedding_size, input_dim = unique_size,
                           input_length=1, name=name + '_embedding')(single)
        embeddings = tf.keras.layers.Flatten(name = 'flatten_' + name)(embeddings)
        return single, embeddings

    def continous_inputs(self, size=None, name='numeric'):
        """Build the float input for numeric features.

        The numeric features are used as-is, so the same tensor is returned
        both as the input and as its representation.
        """
        inp = tf.keras.layers.Input(shape=(size,), dtype='float32', name=name)
        return inp, inp


# fake dataset

In [379]:
# Seed both RNGs used below so the synthetic dataset is reproducible across runs.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

data_size = 5000
max_transaction_history = 20
max_product_click_history = 20
max_promotion_click_history = 20
input_embedding_size = 16
category_size = 50
numeric_size = 1


def _fake_history(raw_length, padded_length):
    """Return a (data_size, padded_length) int array of random category ids.

    Ids are drawn from [1, category_size): 0 is the value pad_sequences fills
    with and the value the sequence embeddings mask (mask_zero=True), so it
    must not be used as a real category.
    """
    ids = np.random.randint(1, category_size, size=(data_size, raw_length))
    return tf.keras.preprocessing.sequence.pad_sequences(ids, padded_length, padding='post')


# One random raw length (shared by all rows) for the transaction history,
# fixed shorter-than-max lengths for the two click histories.
data1 = _fake_history(random.randint(1, max_transaction_history), max_transaction_history)
data2 = _fake_history(max_product_click_history - 3, max_product_click_history)
data3 = _fake_history(max_promotion_click_history - 4, max_promotion_click_history)

inputs = [data1, data2, data3]

# {feature name: (number of unique values, embedding size)}, e.g. a location feature
single_category_cols = {105: (3, 8), 106: (5, 8), 107: (10, 8)}
for unique_size, _ in single_category_cols.values():
    inputs.append(np.random.randint(unique_size, size=(data_size, 1)))

num1 = np.random.random(size=(data_size, numeric_size))
inputs.append(num1)

# np.random.randint excludes the upper bound, so `category_size` (not
# `category_size - 1`) is needed for the last class to be generated; the lower
# bound of 1 keeps the padding id out of the targets.
labels = np.random.randint(1, category_size, size=(data_size, 1))
one_hot_labels = tf.keras.utils.to_categorical(labels, num_classes=category_size)

In [380]:
inputs[0]

array([[ 6, 23, 23, ...,  0,  0,  0],
       [24, 25, 26, ...,  0,  0,  0],
       [36, 35, 40, ...,  0,  0,  0],
       ...,
       [17, 25, 20, ...,  0,  0,  0],
       [ 2, 19, 10, ...,  0,  0,  0],
       [36, 41, 48, ...,  0,  0,  0]], dtype=int32)

# Training

In [149]:
# learning rate schedule
def step_decay(epoch):
    """Step-wise learning-rate schedule.

    Starts at 0.1 and multiplies the rate by 0.8 once for every full block of
    10 counted in ``epoch + 1``.

    Args:
        epoch: epoch index.

    Returns:
        The learning rate for that epoch as a float.
    """
    base_rate, decay_factor, block_length = 0.1, 0.8, 10.0
    completed_blocks = math.floor((1 + epoch) / block_length)
    return base_rate * math.pow(decay_factor, completed_blocks)

In [192]:
def top_2(y_true, y_pred):
    """Keras metric: sparse top-k categorical accuracy with k fixed to 2."""
    sparse_top_k = tf.keras.metrics.sparse_top_k_categorical_accuracy
    return sparse_top_k(y_true, y_pred, k=2)

In [292]:
def tf_loss_wrapper(y_true, y_pred, loss, k=2):
    """Adapt a ``tf.metrics``-style ``*_at_k`` function to a single tensor.

    ``loss`` is called as ``loss(labels, predictions, k=k)`` and must return a
    ``(value, update_op)`` pair.  The returned tensor is the value, wrapped so
    that evaluating it first runs the update op.

    Args:
        y_true: ground-truth tensor; cast to int64 before the call.
        y_pred: prediction tensor; cast to float32 before the call.
        loss: callable returning ``(value, update_op)``.
        k: cut-off forwarded to ``loss`` (defaults to 2, the previously
            hard-coded value).

    Returns:
        A tensor holding the metric value.
    """
    score, up_opt = loss(tf.cast(y_true, tf.int64), tf.cast(y_pred, tf.float32), k=k)
    # Initialise TF local variables in the Keras session right after the metric
    # op has been created (same workaround as the sibling metric functions).
    tf.keras.backend.get_session().run(tf.local_variables_initializer())
    # Force the update op to run whenever the returned tensor is evaluated.
    with tf.control_dependencies([up_opt]):
        score = tf.identity(score)
    return score

In [303]:
def average_precision_at_k(y_true, y_pred):
    """Keras metric wrapper around ``tf.metrics.average_precision_at_k`` with k=2.

    Note: this definition shadows the ``average_precision_at_k`` name imported
    from ``extra_keras_metrics`` earlier in the notebook.

    Args:
        y_true: ground-truth tensor; cast to int64.
        y_pred: prediction tensor; cast to float32.

    Returns:
        The metric value tensor, evaluated after the metric's update op.
    """
    labels = tf.cast(y_true, tf.int64)
    predictions = tf.cast(y_pred, tf.float32)
    value, update_op = tf.metrics.average_precision_at_k(labels, predictions, k=2)

    # Initialise TF local variables in the Keras session once the metric op exists.
    session = tf.keras.backend.get_session()
    session.run(tf.local_variables_initializer())

    # Make reading the value trigger the update op first.
    with tf.control_dependencies([update_op]):
        return tf.identity(value)

In [344]:
def precision_at_2(y_true, y_pred):
    """Keras metric wrapper around ``tf.metrics.precision_at_k`` with k=2.

    ``tf.metrics.precision_at_k`` returns a ``(value, update_op)`` pair; a
    Keras metric function has to return a single tensor, so the pair is
    unpacked and the value is returned with a control dependency on the update
    op (the same pattern ``precision_at_1`` uses).

    Args:
        y_true: ground-truth tensor; cast to int64.
        y_pred: prediction tensor; cast to float32.

    Returns:
        The precision@2 value tensor, evaluated after the metric's update op.
    """
    score, up_opt = tf.metrics.precision_at_k(tf.cast(y_true, tf.int64), tf.cast(y_pred, tf.float32), k=2)
    # Initialise TF local variables in the Keras session once the metric op exists.
    tf.keras.backend.get_session().run(tf.local_variables_initializer())
    # Make reading the value trigger the update op first.
    with tf.control_dependencies([up_opt]):
        score = tf.identity(score)
    return score

In [345]:
def precision_at_1(y_true, y_pred):
    """Keras metric wrapper around ``tf.metrics.precision_at_k`` with k=1.

    Args:
        y_true: ground-truth tensor; cast to int64.
        y_pred: prediction tensor; cast to float32.

    Returns:
        The precision@1 value tensor, evaluated after the metric's update op.
    """
    labels = tf.cast(y_true, tf.int64)
    predictions = tf.cast(y_pred, tf.float32)
    value, update_op = tf.metrics.precision_at_k(labels, predictions, k=1)

    # Initialise TF local variables in the Keras session once the metric op exists.
    session = tf.keras.backend.get_session()
    session.run(tf.local_variables_initializer())

    # Make reading the value trigger the update op first.
    with tf.control_dependencies([update_op]):
        return tf.identity(value)

In [346]:
def overlapping_at_2(y_true, y_pred):
    """Placeholder metric: not implemented yet, always returns ``None``.

    TODO: implement, or remove if it stays unused.
    """
    return None

In [425]:
# Build the network from the synthetic-dataset configuration and keep only the Keras model.
model = RecoDNN(
    max_transaction_history=max_transaction_history,
    max_product_click_history=max_product_click_history,
    max_promotion_click_history=max_promotion_click_history,
    category_size=category_size,
    single_categorical_features=single_category_cols,
    numeric_features_size=numeric_size,
    input_embedding_size=input_embedding_size,
).model

In [426]:
# Integer class labels -> sparse cross-entropy; track accuracy and precision@2.
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy', precision_at_2],
)

In [427]:
#model.summary()

In [428]:
#tf.keras.utils.plot_model(model, to_file='../figures/model.png', show_shapes=True, show_layer_names=True)

In [429]:
# Initialise TF local variables in the session Keras uses.
# NOTE(review): presumably required by the accumulators of the tf.metrics-based
# metric passed to compile() above — confirm.
tf.keras.backend.get_session().run(tf.local_variables_initializer())

In [None]:
# Train on the synthetic inputs with 20% of the samples held out for validation;
# %time reports how long the whole fit takes. No callbacks are registered.
%time model.fit(x=inputs, y=labels, epochs=200, batch_size=64, callbacks=[], validation_split=0.2)

In [None]:
# Persist the trained model; the same path is read back below to extract user embeddings.
model.save('../outputs/reco_dnn.h5')  # save model 

# Infer user embeddings

In [132]:
# The saved architecture contains the custom PositionEmbedding layer, which Keras
# can only rebuild when it is registered through `custom_objects`.
# compile=False skips restoring the training setup (optimizer and the custom
# precision_at_2 metric), which is not needed for inference.
lmodel = tf.keras.models.load_model(
    '../outputs/reco_dnn.h5',
    custom_objects={'PositionEmbedding': pos_embd.PositionEmbedding},
    compile=False,
)

In [133]:
# Sub-model that shares the loaded model's inputs but stops at the named dense
# layer, so predict() returns that layer's activations.
layer_name = 'user_embedding'
user_embedding_output = lmodel.get_layer(layer_name).output
intermediate_layer_model = tf.keras.Model(inputs=lmodel.input, outputs=user_embedding_output)


In [134]:
sz = 3  # use the first three rows as inputs
# Slice every model input instead of spelling out indices 0..6, so this keeps
# working when features are added or removed; still a tuple, in model-input order.
dat = tuple(feature[0:sz] for feature in inputs)

In [135]:
# Run the truncated model on the sample rows; the result is the output of the
# 'user_embedding' layer, one row per input row.
intermediate_output = intermediate_layer_model.predict(dat)

In [136]:
intermediate_output   # these are embeddings for users

array([[ 0.       ,  0.       ,  4.400964 ,  0.       ,  6.740262 ,
         0.       , 11.438016 ,  6.269281 ,  1.6220105,  5.1103024,
         0.       , 10.057777 ,  2.1798482,  7.217703 ,  2.2895126,
         0.       ,  5.0285788,  1.2717619,  0.       , 11.747034 ,
        14.142303 , 20.50268  ,  2.7235174,  0.       ,  1.0890514,
         7.7049932,  6.673367 , 10.590452 ,  8.015552 , 20.846992 ,
        16.123056 , 18.163578 ,  4.228752 ,  0.       ,  1.5433388,
         0.6095442,  0.       ,  0.       ,  0.       , 11.783399 ,
         1.5408026,  0.       ,  5.787854 ,  0.       ,  8.5177765,
         0.       , 11.734498 ,  0.       , 25.783195 ,  1.3734124,
        11.107173 ,  4.8844457,  1.2880889,  0.       ,  7.4841986,
         9.729629 ,  7.0148015,  6.7179275,  2.5550754, 16.474016 ,
         0.       ,  0.       ,  3.206796 ,  0.       ],
       [ 6.6823826,  0.       ,  2.0572307,  0.       , 10.38165  ,
         0.       ,  4.9173765,  5.3423514, 12.363356 ,  3.

In [262]:
lmodel.predict(dat)

array([[9.88500866e-14, 1.66809485e-07, 8.02611992e-13, 5.57731146e-05,
        2.01628845e-20, 3.55871374e-11, 4.18647318e-07, 8.19514900e-10,
        6.91004449e-17, 3.52575065e-04, 1.46854493e-13, 1.39596227e-06,
        5.18735532e-12, 1.07548644e-10, 3.26124949e-09, 2.33259256e-04,
        3.78430604e-13, 7.89409770e-12, 5.96693326e-08, 2.84212961e-14,
        5.26116650e-10, 6.93088948e-20, 2.22797962e-06, 3.25721611e-12,
        4.92618014e-15, 4.56983652e-11, 1.02549454e-03, 5.16495277e-07,
        3.06632064e-05, 2.56538044e-07, 4.08812921e-05, 2.99763379e-08,
        3.87134548e-07, 5.26721378e-09, 9.98016841e-14, 6.25430082e-04,
        2.86690261e-14, 1.62516951e-08, 6.75517142e-11, 2.02315914e-05,
        2.32272708e-12, 1.97077854e-10, 5.13877219e-10, 3.10492964e-04,
        1.84082262e-06, 9.97060716e-01, 2.27206576e-04, 9.92866080e-06,
        2.66013101e-12, 1.99611918e-31],
       [4.68875896e-07, 1.37154132e-07, 9.82510414e-07, 1.38129623e-08,
        3.57998857e-08,