In [45]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
import numpy as np
import builtins
import keras
import tensorflow as tf
from keras import backend as K
from extra_keras_metrics import average_precision_at_k
from IPython.display import SVG
from keras.utils.vis_utils import plot_model
import pydot as pyd
from keras.utils.vis_utils import model_to_dot
from keras.layers import Input, Dense, Embedding, Lambda, Reshape, Flatten, Average
from keras.models import Model
from keras.backend import mean, max
from keras.preprocessing.sequence import pad_sequences

# Rebind the `pydot` name inside keras' vis_utils to the pydot module imported above.
# NOTE(review): presumably a workaround so plot_model / model_to_dot find a working
# pydot installation — confirm it is still needed for the installed Keras version.
keras.utils.vis_utils.pydot = pyd

In [47]:
import data_generator

In [6]:
# Display the TensorFlow version this notebook was executed with.
tf.__version__

'1.13.1'

In [7]:
class RecoDNN():
    """Feed-forward recommender over pooled category-history embeddings.

    Three integer id sequences (inputs ``seq_categorical_0`` .. ``seq_categorical_2``,
    sized by the three ``max_*_history`` arguments) share one ``Embedding`` table.
    Each sequence is reduced to one vector by averaging over the sequence axis.
    Those vectors are concatenated with the optional single-value categorical
    embeddings and the numeric feature block, passed through three ``Dense``
    layers and projected to a softmax over ``category_size`` classes.

    The assembled Keras ``Model`` is exposed as ``self.model``.
    """

    def __init__(self, max_transaction_history = 50, max_product_click_history = 50, max_promotion_click_history = 50,
                 category_size = 100, single_categorical_features = None, numeric_features_size = 10,
                 hidden_layer1_size = 256, hidden_layer2_size = 128, hidden_layer3_size = 64, activation='relu',
                 input_embedding_size = 64):
        """Store the hyper-parameters, create the shared embedding and build the model.

        Args:
            max_transaction_history: length of the first id-sequence input.
            max_product_click_history: length of the second id-sequence input.
            max_promotion_click_history: length of the third id-sequence input.
            category_size: rows of the shared embedding table and width of the
                softmax output.
            single_categorical_features: optional mapping
                ``{feature name: number of distinct values}``; one single-value
                input with its own embedding is created per entry, in mapping order.
            numeric_features_size: width of the float numeric input.
            hidden_layer1_size: units of the first hidden ``Dense`` layer.
            hidden_layer2_size: units of the second hidden ``Dense`` layer.
            hidden_layer3_size: units of the third hidden ``Dense`` layer
                (named ``user_embedding``).
            activation: activation used by the three hidden layers.
            input_embedding_size: embedding width for every categorical input.
        """
        self.max_transaction_history = max_transaction_history
        self.max_product_click_history = max_product_click_history
        self.max_promotion_click_history = max_promotion_click_history
        self.category_size = category_size
        self.hidden_layer1_size = hidden_layer1_size
        self.hidden_layer2_size = hidden_layer2_size
        self.hidden_layer3_size = hidden_layer3_size
        self.single_categorical_features = single_categorical_features
        self.numeric_features_size = numeric_features_size
        self.activation = activation
        self.input_embedding_size = input_embedding_size

        # No fixed ``input_length``: this table is shared by three inputs whose
        # lengths may differ, and Keras rejects an input whose length disagrees
        # with a fixed ``input_length``. Nothing downstream needs a static length
        # because every sequence is pooled over axis 1.
        # NOTE(review): mask_zero=True is requested here, but the pooling Lambda in
        # seq_categorical_input calls mean(x, axis=1) directly and never looks at a
        # mask, so every position contributes to the average — confirm intent.
        self.category_embeddings = Embedding(output_dim=self.input_embedding_size,
                                             input_dim=self.category_size,
                                             mask_zero=True,
                                             name='category_embeddings')

        self.build()

    def build(self):
        """Connect the inputs to the dense tower and store the result in ``self.model``."""
        input_tensors, feature_tensors = self.create_input()

        hidden = Dense(self.hidden_layer1_size, activation=self.activation)(
            keras.layers.concatenate(feature_tensors))
        hidden = Dense(self.hidden_layer2_size, activation=self.activation)(hidden)
        hidden = Dense(self.hidden_layer3_size, activation=self.activation, name='user_embedding')(hidden)
        output = Dense(self.category_size, activation='softmax', name='softmax_layer')(hidden)
        self.model = Model(inputs=input_tensors, outputs=[output])

    def create_input(self):
        """Create every model input together with the tensor fed to the dense tower.

        Returns:
            ``(inp_layer, inp_embed)``: two parallel lists ordered as the three
            sequence inputs, then the single categorical inputs (mapping order),
            then the numeric input. ``inp_layer`` holds the ``Input`` tensors and
            ``inp_embed`` the corresponding feature tensors.
        """
        history_lengths = [self.max_transaction_history,
                           self.max_product_click_history,
                           self.max_promotion_click_history]
        seqs = [self.seq_categorical_input('seq_categorical_' + str(i), length)
                for i, length in enumerate(history_lengths)]

        singles = []
        if self.single_categorical_features:
            singles = [self.singe_categorical_input(str(col), unique_size)
                       for col, unique_size in self.single_categorical_features.items()]

        nums = self.continous_inputs(self.numeric_features_size)

        pairs = seqs + singles + [nums]
        inp_layer = [pair[0] for pair in pairs]
        inp_embed = [pair[1] for pair in pairs]
        return inp_layer, inp_embed

    def seq_categorical_input(self, name, max_history):
        """Create one id-sequence input and its averaged embedding.

        Args:
            name: name of the ``Input``; the pooling layer is named
                ``name + '_avg_embedding'``.
            max_history: length of the id sequence.

        Returns:
            ``(input tensor, mean of the shared embeddings over axis 1)``.
        """
        seq = Input(shape=(max_history,), dtype='int32', name=name)
        input_embeddings = self.category_embeddings(seq)
        # Only average pooling is used; the max-pooling branch that was built and
        # then discarded has been removed.
        avg_embedding = Lambda(lambda x: mean(x, axis=1), name=name + '_avg_embedding')(input_embeddings)
        return seq, avg_embedding

    def singe_categorical_input(self, name, unique_size):
        """Create a single-value categorical input with its own flattened embedding.

        The method name (sic) is kept unchanged for existing callers.

        Args:
            name: name of the ``Input``; also used to name the embedding and
                flatten layers.
            unique_size: number of rows in this feature's embedding table.

        Returns:
            ``(input tensor, flattened embedding tensor)``.
        """
        single = Input(shape=(1,), dtype='int32', name=name)
        embeddings = Embedding(output_dim=self.input_embedding_size, input_dim=unique_size,
                               input_length=1, name=name + '_embedding')(single)
        embeddings = Flatten(name='flatten_' + name)(embeddings)
        return single, embeddings

    def continous_inputs(self, size=None, name='numeric'):
        """Create the float numeric input.

        The method name (sic) is kept unchanged for existing callers.

        Returns:
            The same ``Input`` tensor twice, because numeric features are fed to
            the dense tower without any transformation.
        """
        inp = Input(shape=(size,), dtype='float32', name=name)
        return inp, inp


# fake dataset

In [30]:
# Dimensions of the synthetic dataset; the same names are reused when the model is built.
data_size = 10000
max_transaction_history = 50
max_product_click_history = 50
max_promotion_click_history = 50
input_embedding_size = 64
category_size = 100
numeric_size = 10

# Seed NumPy's global RNG so the synthetic data is identical on every run.
SEED = 42
np.random.seed(SEED)

# Three id sequences (one row per sample) with ids drawn from [0, category_size).
data1 = np.random.randint(category_size, size=(data_size, max_transaction_history))
data2 = np.random.randint(category_size, size=(data_size, max_product_click_history))
data3 = np.random.randint(category_size, size=(data_size, max_promotion_click_history))
inputs = [data1, data2, data3]

single_category_cols = {105:3,106:5,107:10}   ## such as location : unique_value_size
# One integer column per single-value categorical feature, in mapping order.
for unique_size in single_category_cols.values():
    inputs.append(np.random.randint(unique_size, size=(data_size, 1)))

# Numeric feature block with values in [0, 1).
num1 = np.random.random(size=(data_size, numeric_size))
inputs.append(num1)

# Random target category per sample, plus its one-hot encoding.
labels = np.random.randint(category_size, size=(data_size, 1))
one_hot_labels = keras.utils.to_categorical(labels, num_classes=category_size)


## save locally

In [9]:
# Put the label column after the feature arrays without touching `inputs` itself,
# then stack everything column-wise into one table.
inputs1 = inputs + [labels]
merge = np.hstack(inputs1)

# Persist the table as text; every value is written with three decimals.
np.savetxt('../data/features.csv', merge, fmt="%2.3f")

(10000, 164)


In [61]:
# Slice `merge` row by row into a tf.data Dataset. The argument is a 1-tuple, so each
# element is a single-component tuple holding one full row (features and label together).
dataset = tf.data.Dataset.from_tensor_slices((merge,))

In [65]:
# Inspect the per-element shape of the dataset built from `merge`.
dataset.output_shapes

(TensorShape([Dimension(164)]),)

In [66]:
# Group elements into batches of 32 and repeat the dataset.
# NOTE(review): this rebinds `dataset` to its own transformation, so re-running the
# cell batches the already-batched dataset again.
dataset = dataset.batch(32).repeat()

# Training

In [48]:
# Build the network with the dimensions used for the synthetic data and keep only
# the underlying Keras model.
model = RecoDNN(
    max_transaction_history=max_transaction_history,
    max_product_click_history=max_product_click_history,
    max_promotion_click_history=max_promotion_click_history,
    category_size=category_size,
    single_categorical_features=single_category_cols,
    numeric_features_size=numeric_size,
    input_embedding_size=input_embedding_size,
).model

In [49]:
# The network ends in a softmax layer, so it is trained with categorical
# cross-entropy; accuracy is tracked as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [50]:
# Print the layer-by-layer overview (output shapes, parameter counts, connections).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
seq_categorical_0 (InputLayer)  (None, 50)           0                                            
__________________________________________________________________________________________________
seq_categorical_1 (InputLayer)  (None, 50)           0                                            
__________________________________________________________________________________________________
seq_categorical_2 (InputLayer)  (None, 50)           0                                            
__________________________________________________________________________________________________
105 (InputLayer)                (None, 1)            0                                            
__________________________________________________________________________________________________
106 (Input

In [51]:
#plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)

In [53]:
#SVG(model_to_dot(model).create(prog='dot', format='svg'))


In [None]:
#%time model.fit(x=inputs, y=one_hot_labels, epochs=20, batch_size=32)

In [69]:
%time model.fit(x=dataset, epochs=20, batch_size=64, steps_per_epoch=10000//64)

AttributeError: 'DatasetV1Adapter' object has no attribute 'ndim'

In [57]:
# Create the batch source that reads from the CSV file written earlier.
# NOTE(review): the meaning of the positional arguments (64, [50, 100, 150],
# [150, 151, 152], [153]) is defined in data_generator.data_generator, which is not
# shown here — presumably a batch size followed by column boundaries/indices for the
# sequence, single-categorical and remaining columns; verify against that module.
generator=data_generator.data_generator('../data/features.csv', 64, [50, 100, 150], [150,151,152], [153])


In [58]:
#generator=data_generator.DataGenerator('../data/features.csv', 10000, 64, 100, [50, 100, 150], [150,151,152], [153])

In [59]:
# Train from `generator` for 50 epochs of 10000//64 steps each and time the whole call.
# NOTE(review): `shuffle`, `use_multiprocessing` and `workers` behave differently for a
# plain Python generator than for a keras.utils.Sequence; what data_generator returns
# is not visible here — confirm that 15 worker processes do not duplicate batches.
%time model.fit_generator(generator,steps_per_epoch=10000//64, verbose=1, epochs=50, shuffle=True, use_multiprocessing=True, workers=15)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
CPU times: user 1min 23s, sys: 50.5 s, total: 2min 13s
Wall time: 40.8 s


<keras.callbacks.History at 0x12eaa22e8>