In [1]:
import pandas as pd

# Data source: https://www.kaggle.com/datasets/leonerd/criteo-small
# The file is tab-separated and is read without a header row.
df = pd.read_csv(
    'train_1m.txt',
    header=None,
    sep='\t',
)

In [2]:
# Name the columns: the label first, then I1..I13, then C14..C39.
df.columns = [
    'label',
    *(f'I{idx}' for idx in range(1, 14)),
    *(f'C{idx}' for idx in range(14, 40)),
]

In [3]:
# Work on a 10,000-row random subset to keep the experiment small.
# A fixed random_state makes the subset (and everything derived from it)
# identical on every "Restart & Run All"; without it each run trains and
# validates on different rows.
df = df.sample(n=10000, random_state=42).reset_index(drop=True)

数据预处理

In [4]:
# Split the column names by prefix: 'I...' columns and 'C...' columns.
dense_feats = list(filter(lambda col: col.startswith('I'), df.columns))
sparse_feats = list(filter(lambda col: col.startswith('C'), df.columns))

In [5]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [6]:
def process_dense_feats(data, feats):
    """Fill missing values and log-transform the numeric feature columns.

    Missing values are replaced by 0. Every value ``x > -1`` is mapped to
    ``log(x + 1)``; every other value (where the logarithm is undefined) is
    mapped to ``-1``.

    Args:
        data: DataFrame containing at least the columns listed in ``feats``.
            It is not modified.
        feats: list of numeric column names to transform.

    Returns:
        A new float DataFrame with exactly the columns in ``feats``.
    """
    # Selecting columns and fillna both return new objects, so the caller's
    # frame is never touched and no explicit copy is needed.
    d = data[feats].fillna(0.0).astype('float64')
    valid = d > -1
    # Vectorized over the whole frame instead of a Python-level apply per
    # element. Invalid entries are temporarily set to 0 so log1p never sees
    # a value <= -1 (no NaN/-inf, no runtime warnings), then overwritten
    # with the -1 sentinel.
    return np.log1p(d.where(valid, 0.0)).where(valid, -1.0)

# Transform the dense feature columns of the sampled frame.
data_dense = process_dense_feats(data=df, feats=dense_feats)

def process_sparse_feats(data, feats):
    """Label-encode the categorical feature columns.

    Missing values are replaced by the string '-1' before encoding, so they
    become a category of their own. Each column gets its own, independently
    fitted LabelEncoder.

    Args:
        data: DataFrame containing at least the columns listed in ``feats``.
        feats: list of categorical column names to encode.

    Returns:
        A new DataFrame with exactly the columns in ``feats``, holding the
        integer codes produced by the encoders.
    """
    encoded = data[feats].fillna('-1')
    for col in feats:
        encoder = LabelEncoder()
        encoded[col] = encoder.fit_transform(encoded[col])
    return encoded

# Encode the sparse feature columns of the sampled frame.
data_sparse = process_sparse_feats(data=df, feats=sparse_feats)

# Put the processed dense and sparse columns side by side and attach the
# label column from the sampled frame.
total_data = pd.concat([data_dense, data_sparse], axis=1).assign(label=df['label'])

构建dense层的一阶特征

In [7]:
import tensorflow as tf
from tensorflow.keras.layers import *
import tensorflow.keras.backend as K 
from tensorflow.keras.models import Model

In [8]:
# One Keras input of shape [1] per dense feature, named after its column.
dense_inputs = [Input([1], name=feat_name) for feat_name in dense_feats]

In [9]:
# Join the per-feature dense inputs along axis 1 into a single tensor.
concat_dense_inputs = Concatenate(axis=1)(dense_inputs)

In [10]:
# First-order term for the dense features: a single-unit Dense layer applied
# to the concatenated dense inputs.
fst_order_dense_layer = Dense(1)(concat_dense_inputs)

构建sparse层的一阶特征

In [11]:
# One Keras input of shape [1] per sparse feature, named after its column.
sparse_inputs = [Input([1], name=feat_name) for feat_name in sparse_feats]

In [13]:
# One 1-dimensional embedding per sparse feature, each L2-regularized.
sparse_ld_embed = []
for feat_name, feat_input in zip(sparse_feats, sparse_inputs):
    # Distinct values in the raw column; the table gets one extra row.
    vocab = df[feat_name].nunique()
    weight_decay = tf.keras.regularizers.l2(0.5)
    embedded = Embedding(vocab + 1, 1, embeddings_regularizer=weight_decay)(feat_input)
    # The Embedding output is 2-D per sample, so it has to go through a
    # Flatten layer before it can be combined with Dense-style outputs.
    sparse_ld_embed.append(Flatten()(embedded))

In [14]:
# First-order term for the sparse features: element-wise sum of all the
# flattened 1-dimensional embeddings.
fst_order_sparse_layer = Add()(sparse_ld_embed)

Linear 部分合并

In [15]:
# Linear part of the model: dense first-order term plus sparse first-order term.
linear_part = Add()([fst_order_dense_layer, fst_order_sparse_layer])

二阶特征

In [16]:
k = 8  # embedding size
# Only the sparse features take part in the second-order crosses.
# Each sparse input gets its own k-dimensional, L2-regularized embedding whose
# table has one row more than the number of distinct raw values.
sparse_kd_embed = [
    Embedding(
        df[feat_name].nunique() + 1,
        k,
        embeddings_regularizer=tf.keras.regularizers.l2(0.7),
    )(feat_input)
    for feat_name, feat_input in zip(sparse_feats, sparse_inputs)
]

In [17]:
# Stack the per-feature k-dimensional embeddings along axis 1, so that axis
# holds one slice per sparse feature.
concat_sparse_kd_embed = Concatenate(axis=1)(sparse_kd_embed)

In [18]:
# Sum first, then square: add up the embeddings over axis 1 (the axis the
# per-feature embeddings were concatenated on), then multiply the result by
# itself element-wise.
sum_kd_embed = Lambda(lambda x: K.sum(x,axis=1))(concat_sparse_kd_embed)
square_sum_kd_embed = Multiply()([sum_kd_embed, sum_kd_embed])

In [19]:
# Square first, then sum: multiply the stacked embeddings by themselves
# element-wise, then add the squares up over axis 1.
square_kd_embed = Multiply()([concat_sparse_kd_embed, concat_sparse_kd_embed])
sum_square_kd_embed = Lambda(lambda x: K.sum(x,axis=1))(square_kd_embed)

In [20]:
# Subtract and divide by 2: 0.5 * ((sum of embeddings)^2 - sum of squared
# embeddings), computed element-wise.
sub = Subtract()([square_sum_kd_embed,sum_square_kd_embed])
sub = Lambda(lambda x: x*0.5)(sub)
# Collapse the remaining axis 1 to a single value per sample; keepdims=True
# keeps that axis with size 1.
snd_order_sparse_layer = Lambda(lambda x: K.sum(x, axis=1,keepdims=True))(sub)

DNN

In [21]:
# Deep part: the flattened k-dimensional sparse embeddings go through three
# 256-unit ReLU layers, each followed by dropout (rates 0.5, 0.3, 0.1), and
# end in a single-unit Dense layer.
flatten_sparse_embed = Flatten()(concat_sparse_kd_embed)
fc_layer = flatten_sparse_embed
for drop_rate in (0.5, 0.3, 0.1):
    fc_layer = Dense(256, activation='relu')(fc_layer)
    fc_layer = Dropout(drop_rate)(fc_layer)
fc_layer_output = Dense(1)(fc_layer)

输出结果

In [22]:
# Add the linear part, the second-order sparse term and the DNN output, then
# apply a sigmoid to the sum.
summed_terms = Add()([linear_part, snd_order_sparse_layer, fc_layer_output])
output_layer = Activation('sigmoid')(summed_terms)

In [23]:
# The model takes the dense inputs followed by the sparse inputs, in that order.
model = Model(inputs=dense_inputs + sparse_inputs, outputs=output_layer)

In [24]:
# Adam optimizer with binary cross-entropy loss; cross-entropy and AUC are
# also reported as metrics.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[
        'binary_crossentropy',
        tf.keras.metrics.AUC(name='auc'),
    ],
)

In [29]:
# Sanity check of the assembled frame: (number of rows, number of columns).
total_data.shape

(10000, 40)

In [30]:
# Positional split: the first 8,000 rows are used for training and the
# remaining rows for validation.
# .iloc slices are end-exclusive, so the two sets are disjoint. The previous
# label-based slices (.loc[:8000] and .loc[8000:]) are end-inclusive, which
# put row 8000 into BOTH sets and leaked a validation sample into training.
train_data = total_data.iloc[:8000]
valid_data = total_data.iloc[8000:]

In [31]:
def _column_arrays(frame, columns):
    """Return one array per requested column of ``frame``, in the given order."""
    return [frame[name].values for name in columns]


# Model inputs are lists with one array per feature; labels are wrapped in a
# single-element list.
train_dense_x = _column_arrays(train_data, dense_feats)
train_sparse_x = _column_arrays(train_data, sparse_feats)
train_label = [train_data['label'].values]

val_dense_x = _column_arrays(valid_data, dense_feats)
val_sparse_x = _column_arrays(valid_data, sparse_feats)
val_label = [valid_data['label'].values]

In [32]:
# Train for 5 epochs with batches of 64. Inputs are passed as dense arrays
# followed by sparse arrays, and the validation split is evaluated alongside.
model.fit(
    x=train_dense_x + train_sparse_x,
    y=train_label,
    batch_size=64,
    epochs=5,
    validation_data=(val_dense_x + val_sparse_x, val_label),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x29051134580>