In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from collections import OrderedDict
from tensorflow.python.keras.layers import Input
import tensorflow as tf
from utils import padding_process
from input import create_embedding_dict,embedding_lookup
import feature_columns as fc_lib
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.layers import LSTM, Lambda, Layer
from tensorflow.python.keras.initializers import Zeros, glorot_normal
from tensorflow.python.keras.regularizers import l2
from collections import defaultdict

from model import DNN,AttentionSequencePoolingLayer
from utils import activation_layer,concat_func
from feature_columns import build_input_features,SparseFeat,VarLenSparseFeat

#tf.config.experimental_run_functions_eagerly(True)


## 读取数据

In [None]:
## get config
# Locations of the train / test sample files consumed by get_xy_fd.
train_path="data/local_train_splitByUser"
test_path="data/local_test_splitByUser"

# Load the pickled vocabularies used to map raw ids to integer indices.
# Each file is opened in a `with` block so the handle is closed right after
# loading instead of leaking for the life of the kernel.
# NOTE(review): pickle.load can execute arbitrary code; only load vocabulary
# files that come from a trusted source.
with open("data/uid_voc.pkl","rb") as f:
    uid_dict=pickle.load(f)
with open("data/mid_voc.pkl","rb") as f:
    mid_dict=pickle.load(f)
with open("data/cat_voc.pkl","rb") as f:
    cate_dict=pickle.load(f)

## 数据处理

In [14]:

def get_xy_fd(path):
    """Load one sample file and convert it into model inputs and labels.

    Relies on notebook-level names that must already be defined when this
    function is called: ``maxlen``, ``feature_columns``, ``uid_dict``,
    ``mid_dict`` and ``cate_dict``.

    Args:
        path: Path of a headerless tab-separated file whose six columns are,
            in order: label, uid, item_id, cate_id, seq_item, seq_cate. The
            two sequence columns are strings of tokens separated by the
            control character chr(2).

    Returns:
        A tuple ``(x, y)``. ``x`` maps every input name returned by
        ``build_input_features(feature_columns)`` to a NumPy array, and ``y``
        is the label column as a NumPy array.
    """
    # Only the first 300000 rows of the file are kept.
    df = pd.read_csv(path, sep="\t", header=None).head(300000)
    df.columns = ['label', 'uid', 'item_id', 'cate_id', 'seq_item', 'seq_cate']

    # Split every history string once and reuse the token lists below.
    item_tokens = df['seq_item'].map(lambda t: t.split("\002"))
    cate_tokens = df['seq_cate'].map(lambda t: t.split("\002"))

    # Unpadded history length, capped at maxlen so it always agrees with the
    # padded sequence width (previously capped at a hard-coded 10). It is
    # computed before padding_process sees the token lists.
    df['seq_len'] = item_tokens.map(lambda toks: min(len(toks), maxlen))
    df['seq_item'] = item_tokens.map(lambda toks: padding_process(toks, maxlen))
    df['seq_cate'] = cate_tokens.map(lambda toks: padding_process(toks, maxlen))

    # Map raw ids to vocabulary indices. Unknown users fall back to 9999.
    # NOTE(review): unknown items / categories map to None here (dict.get
    # without a default) -- confirm the vocabularies cover every id.
    df['uid'] = df.uid.map(lambda t: uid_dict.get(t, 9999))
    df['item_id'] = df.item_id.map(lambda t: mid_dict.get(t))
    df['cate_id'] = df.cate_id.map(lambda t: cate_dict.get(t))
    # Entries equal to 0 are passed through untouched
    # (presumably the padding value written by padding_process -- TODO confirm).
    df['seq_item'] = df.seq_item.map(lambda t: [mid_dict.get(i) if i != 0 else i for i in t])
    df['seq_cate'] = df.seq_cate.map(lambda t: [cate_dict.get(i) if i != 0 else i for i in t])

    uid = df.uid.values
    iid = df.item_id.values
    cid = df.cate_id.values
    hist_iid = np.array(df.seq_item.tolist())
    hist_cid = np.array(df.seq_cate.tolist())
    seq_length = df.seq_len.values

    feature_dict = {'user': uid, 'item_id': iid, 'cate_id': cid,
                    'hist_item_id': hist_iid, 'hist_cate_id': hist_cid,
                    'seq_length': seq_length}
    # Keep only the inputs the model declares, in the order it declares them.
    features = list(build_input_features(feature_columns).keys())
    x = {name: feature_dict[name] for name in features}
    y = df.label.values
    return x, y

# Build the train / test inputs and labels.
x_train, y_train = get_xy_fd(train_path)
x_test, y_test = get_xy_fd(test_path)


## 定义输入参数

In [6]:
# Size of the id space of each feature
user_len = 543060
item_len = 367983
cate_len = 1601
# Length of the behaviour sequences
maxlen=10

# Feature definitions
feature_columns = [SparseFeat('user', user_len+1, embedding_dim=16), 
                       SparseFeat('item_id', item_len+3, embedding_dim=16), 
                       SparseFeat('cate_id', cate_len + 3, embedding_dim=16)]
# The history features reuse the 'item_id' / 'cate_id' embeddings through
# embedding_name. Their sequence width comes from `maxlen` rather than a
# repeated literal, so the constant above is the single place to change it.
feature_columns += [
    VarLenSparseFeat('hist_item_id', item_len + 3, embedding_dim=16, embedding_name='item_id', maxlen=maxlen, length_name="seq_length"),
    VarLenSparseFeat('hist_cate_id', cate_len + 3, embedding_dim=16, embedding_name='cate_id', maxlen=maxlen,length_name="seq_length")]

# Model input placeholders built from the feature definitions
dnn_feature_columns=feature_columns
sparse_feature_columns = dnn_feature_columns
features = build_input_features(dnn_feature_columns)

# Columns used as the attention query (candidate item) and keys (history)
query_columns=['item_id','cate_id']
key_columns=['hist_item_id','hist_cate_id']
query_feature_columns = [i for i in dnn_feature_columns if i.name in query_columns]
key_feature_columns = [i for i in dnn_feature_columns if i.name in key_columns]

# Input tensors of the model
inputs_list = list(features.values())

[<tf.Tensor 'user_7:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'item_id_7:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'cate_id_7:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'hist_item_id_7:0' shape=(None, 10) dtype=float32>,
 <tf.Tensor 'seq_length_15:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'hist_cate_id_7:0' shape=(None, 10) dtype=float32>]

In [25]:
# Display the embedding-layer dictionary.
# NOTE(review): no visible earlier cell assigns `embedding_dict` at notebook
# scope, so this cell fails on a fresh kernel unless a later cell ran first.
embedding_dict

{'user': <tensorflow.python.keras.layers.embeddings.Embedding at 0x167d856a0>,
 'item_id': <tensorflow.python.keras.layers.embeddings.Embedding at 0x167d85b38>,
 'cate_id': <tensorflow.python.keras.layers.embeddings.Embedding at 0x167d85908>}

In [27]:
# Step-by-step version of the embedding lookups, kept at notebook scope so
# the intermediate tensors can be inspected.
# Fixed typo: the result was bound to `mbedding_dict`, leaving `embedding_dict`
# undefined for the lookups below on a fresh kernel.
# NOTE(review): 1024 and 1e-6 are presumably the init seed and the embedding
# L2 factor -- confirm against create_embedding_dict.
embedding_dict = create_embedding_dict(dnn_feature_columns,1024, 1e-6)
query_emb_list = embedding_lookup(embedding_dict, features, query_feature_columns, to_list=True)
keys_emb_list = embedding_lookup(embedding_dict, features, key_feature_columns,to_list=True)
# NOTE(review): meaning of the extra positional argument `1` is defined by
# embedding_lookup, which is not visible here -- confirm.
dnn_input_emb_list = embedding_lookup(embedding_dict, features, sparse_feature_columns,1,to_list=True)

keys_emb_sum_list = embedding_lookup(embedding_dict, features, key_feature_columns,1,to_list=True)
# Concatenate the per-feature embeddings for the attention inputs
keys_emb =concat_func(keys_emb_list)
deep_input_emb = concat_func(dnn_input_emb_list)
query_emb = concat_func(query_emb_list)

keys_emb_sum=concat_func(keys_emb_sum_list)

In [28]:
# Display the concatenated query embedding tensor built in the previous cell.
query_emb

<tf.Tensor 'concatenate_10/Identity:0' shape=(None, 1, 32) dtype=float32>

## 构建din模型

In [7]:
def build_din_model():
    """Build and compile the Keras model used in this notebook.

    Reads the notebook-level names ``dnn_feature_columns``, ``features``,
    ``query_feature_columns``, ``key_feature_columns``,
    ``sparse_feature_columns`` and ``inputs_list``.

    NOTE(review): in the active code the attention output ``hist`` is computed
    but never connected to the DNN input; the ``Concatenate`` lines that would
    use it (and ``keys_emb_sum``) are commented out. Confirm whether this
    ablation state is intended.

    Returns:
        A ``tf.keras.models.Model`` over ``inputs_list`` compiled with the
        'adam' optimizer and 'binary_crossentropy' as loss and metric.
    """
    # NOTE(review): 1024 / 1e-6 are presumably seed and L2 factor -- confirm
    # against create_embedding_dict.
    embedding_dict = create_embedding_dict(dnn_feature_columns,1024, 1e-6)
    query_emb_list = embedding_lookup(embedding_dict, features, query_feature_columns, to_list=True)
    keys_emb_list = embedding_lookup(embedding_dict, features, key_feature_columns,to_list=True)
    dnn_input_emb_list = embedding_lookup(embedding_dict, features, sparse_feature_columns,1,to_list=True)
    
    keys_emb_sum_list = embedding_lookup(embedding_dict, features, key_feature_columns,1,to_list=True)
    # Concatenate the per-feature embeddings used for the attention scores
    keys_emb =concat_func(keys_emb_list)
    deep_input_emb = concat_func(dnn_input_emb_list)
    query_emb = concat_func(query_emb_list)
    
    # Only referenced by the commented-out Concatenate lines below.
    keys_emb_sum=concat_func(keys_emb_sum_list)
    
    # Attention sub-network configuration
    att_hidden_size=(80, 40,1)
    att_activation="dice"
    att_weight_normalization=False
    # Attention pooling of the history keys against the query.
    # The result is not referenced by any active line after this call.
    hist = AttentionSequencePoolingLayer(att_hidden_size, att_activation,
                                         weight_normalization=att_weight_normalization, supports_masking=True)([query_emb, keys_emb])
    # Build the DNN model
    # `dnn_input` is assigned here but not used afterwards.
    dnn_input=deep_input_emb
    print("hist-------------")
    print(query_emb)
    print(keys_emb)
    #deep_input_emb=tf.keras.layers.Concatenate()([deep_input_emb, query_emb*keys_emb_sum,hist])
    #deep_input_emb=tf.keras.layers.Concatenate()([deep_input_emb, query_emb*keys_emb_sum])
    #deep_input_emb=tf.keras.layers.Concatenate()([deep_input_emb, hist])
    print(deep_input_emb)
    #deep_input_emb=tf.keras.layers.Flatten()(deep_input_emb)
    # Same tensor is printed again because the Flatten step above is disabled.
    print(deep_input_emb)
    # DNN tower configuration
    dnn_hidden_units=(256,128,64)
    dnn_activation="dice"
    l2_reg_dnn=0
    dnn_dropout=0
    seed=1024
    dnn_use_bn=False
    
    output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, dnn_use_bn, seed=seed)(deep_input_emb)
    # Single-unit projection followed by a sigmoid
    output = tf.keras.layers.Dense(1, use_bias=False,kernel_initializer=tf.keras.initializers.glorot_normal(seed))(output)
    output = tf.sigmoid(output)
    # Reshape to one column per row
    output = tf.reshape(output, (-1, 1))
    print(output)
    print(inputs_list)
    model = tf.keras.models.Model(inputs=inputs_list, outputs=output)
    model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
    return model

In [8]:
# Build a fresh model, train it for one epoch and report the test-set AUC.
for i in range(1):
    print("--------------")
    print(i)

    model = build_din_model()
    history = model.fit(
        x_train,
        y_train,
        batch_size=256,
        epochs=1,
        verbose=1,
    )

    y_pred = model.predict(x_test)
    test_auc = roc_auc_score(y_test, y_pred)
    print(test_auc)

--------------
0
查看query和key的大小
Tensor("concatenate_6/Identity:0", shape=(None, 1, 32), dtype=float32)
Tensor("concatenate_4/Identity:0", shape=(None, 10, 32), dtype=float32)
查看attention dnn的输入向量大小
Tensor("attention_sequence_pooling_layer_1/concat_1:0", shape=(None, 10, 128), dtype=float32)
查看attention dnn的输出向量大小
Tensor("attention_sequence_pooling_layer_1/dnn_1/Identity:0", shape=(None, 10, 1), dtype=float32)
查看序列的每个元素的权重大小
Tensor("attention_sequence_pooling_layer_1/SelectV2:0", shape=(None, 1, 10), dtype=float32)
查看序列的大小
Tensor("concatenate_4/Identity:0", shape=(None, 10, 32), dtype=float32)
查看权重*序列后的大小
Tensor("attention_sequence_pooling_layer_1/MatMul:0", shape=(None, 1, 32), dtype=float32)
hist-------------
Tensor("concatenate_6/Identity:0", shape=(None, 1, 32), dtype=float32)
Tensor("concatenate_4/Identity:0", shape=(None, 10, 32), dtype=float32)
Tensor("concatenate_5/Identity:0", shape=(None, 1, 80), dtype=float32)
Tensor("concatenate_5/Identity:0", shape=(None, 1, 80), dtype=floa

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


0.7063235859083515
