In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import joblib

def sparse_faeture_dict(feat_name, feat_num, embed_dim=4):
    """
    Build the descriptor dictionary for one discrete (sparse) feature.

    :param feat_name: name of the feature.
    :param feat_num: number of distinct categories the feature has after encoding.
    :param embed_dim: dimensionality of the feature's embedding vector.
    :return: dict with the keys ``feat_name``, ``feat_num`` and ``embed_dim``.
    """
    return dict(feat_name=feat_name, feat_num=feat_num, embed_dim=embed_dim)

def create_avazu_dataset(path, read_part=True, samples_num=5000, embed_dim=8):
    """
    Load the Avazu CTR files and convert them into integer-encoded arrays.

    Every column listed in ``sparse_features`` is label-encoded with an
    encoder fitted on the union of the training and test values, so both
    splits share one index space per feature.

    :param path: directory that contains ``train.gz`` and ``test.gz``.
    :param read_part: if True, only the first ``samples_num`` rows of the
        training file are read; otherwise the whole file is read. The test
        file is always read in full.
    :param samples_num: number of training rows to read when ``read_part`` is True.
    :param embed_dim: embedding dimension stored in every feature descriptor.
    :return: ``(train_x, train_y), (val_x, val_y), test_x, features_columns``
        where the ``*_x`` / ``*_y`` items are int32 numpy arrays and
        ``features_columns`` is a list of ``sparse_faeture_dict`` descriptors,
        one per sparse feature, in column order.
    """
    print('数据预处理开始')
    sparse_features = ['hour', 'id', 'C1', 'banner_pos', 'site_id', 'site_domain',
                       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
                       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
                       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

    train_path = path + '/train.gz'
    test_path = path + '/test.gz'
    print('加载数据集')
    # nrows=None makes pandas read the complete file.
    train_data = pd.read_csv(train_path, nrows=samples_num if read_part else None)
    test_x = pd.read_csv(test_path)

    # Keep only characters 6-8 of the stringified timestamp; the int cast drops
    # a leading zero. NOTE(review): presumably the column is YYMMDDHH, making
    # this the hour of day -- confirm against the dataset description.
    train_data['hour'] = train_data['hour'].astype(str).str[6:8].astype(int)
    test_x['hour'] = test_x['hour'].astype(str).str[6:8].astype(int)
    print('加载数据完成')
    print('Sparse feature encode')

    features_columns = []
    for feat in sparse_features:
        # A fresh encoder per feature, fitted on train + test values together.
        le = LabelEncoder()
        le.fit(pd.concat([train_data[feat], test_x[feat]]).unique())
        train_data[feat] = le.transform(train_data[feat])
        test_x[feat] = le.transform(test_x[feat])
        # The vocabulary size must cover every index the encoder can emit.
        # Using the maximum seen in the training split alone would leave test
        # rows with indices outside the embedding table.
        features_columns.append(
            sparse_faeture_dict(feat_name=feat, feat_num=len(le.classes_), embed_dim=embed_dim))

    print('Sparse feature encode succeed')

    # 80/20 random split of the training rows into train and validation parts.
    train, val = train_test_split(train_data, test_size=0.2, shuffle=True)
    train_x = train[sparse_features].values.astype('int32')
    train_y = train['click'].values.astype('int32')
    val_x = val[sparse_features].values.astype('int32')
    val_y = val['click'].values.astype('int32')
    test_x = test_x[sparse_features].values.astype('int32')

    print('数据预处理完成')
    return (train_x, train_y), (val_x, val_y), test_x, features_columns

In [None]:
from tensorflow.keras.layers import Layer


class KMaxPooling(Layer):
    """
    K-max pooling layer: keeps the ``k`` largest values along one axis while
    preserving the order in which they appear in the input.

    :param k: number of values to keep along the pooled axis.
    :param dim: index of the axis to pool over (negative values count from the end).
    """

    def __init__(self, k, dim, **kwargs):
        super(KMaxPooling, self).__init__(**kwargs)
        self.k = k
        self.dim = dim

    def call(self, inputs):
        """
        Apply k-max pooling to ``inputs``.

        Keras invokes ``call`` when the layer is applied to a tensor, so the
        pooling logic has to live here for the layer to have any effect.

        :param inputs: tensor whose size along ``self.dim`` is at least ``self.k``.
        :return: tensor with the same shape as ``inputs`` except that axis
            ``self.dim`` has size ``self.k``.
        """
        # Local import keeps the class usable regardless of where the
        # module-level TensorFlow import sits relative to this definition.
        import tensorflow as tf

        rank = len(inputs.shape)
        axis = self.dim % rank
        # top_k only works on the last axis: swap the pooled axis to the end.
        # Swapping two axes is its own inverse, so the same permutation
        # restores the original layout afterwards.
        perm = list(range(rank))
        perm[axis], perm[-1] = perm[-1], perm[axis]
        shifted = tf.transpose(inputs, perm)

        # Indices of the k largest entries, re-sorted so that the selected
        # values keep their original relative positions.
        top_idx = tf.math.top_k(shifted, k=self.k, sorted=False).indices
        top_idx = tf.sort(top_idx, axis=-1)
        pooled = tf.gather(shifted, top_idx, batch_dims=rank - 1)
        return tf.transpose(pooled, perm)

    def forward(self, X):
        """Backward-compatible alias for :meth:`call`."""
        return self.call(X)

    def get_config(self):
        """Return the layer configuration so the layer can be re-created."""
        config = super(KMaxPooling, self).get_config()
        config.update({"k": self.k, "dim": self.dim})
        return config
    
from tensorflow.keras.regularizers import l2
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Conv2D, ZeroPadding2D, Dense, Flatten
from tensorflow.keras.initializers import RandomNormal
import tensorflow as tf
from tensorflow.keras.activations import relu, softmax, sigmoid


class CCPM(Model):
    """
    Convolutional click-prediction model over integer-encoded sparse features.

    Each feature is embedded, the embeddings are stacked into a 2-D map, and
    the map goes through ``len(conv_filters)`` stages of zero-padding,
    convolution, ``KMaxPooling`` and ReLU before a single sigmoid output unit.

    :param feat_column: list of feature descriptors; each must provide the
        keys ``feat_num`` and ``embed_dim``.
    :param conv_kernel_width: kernel width for each convolution stage.
    :param conv_filters: number of filters for each convolution stage; its
        length defines the number of stages.
    :param embed_reg: L2 regularisation factor applied to the embeddings.
    """

    def __init__(self, feat_column, conv_kernel_width=(6, 5, 3), conv_filters=(4, 4, 4), embed_reg=1e-6):
        super(CCPM, self).__init__()
        self.sparse_feat = feat_column
        self.sparse_feat_len = len(self.sparse_feat)
        self.conv_len = len(conv_filters)  # number of convolution stages

        # One embedding table per sparse feature.
        embeddings = []
        for descriptor in self.sparse_feat:
            embeddings.append(
                Embedding(input_dim=descriptor['feat_num'],
                          output_dim=descriptor['embed_dim'],
                          embeddings_initializer=RandomNormal(mean=0.0, stddev=0.0001, seed=2020),
                          embeddings_regularizer=l2(embed_reg),
                          input_length=1))
        self.embedding_list = embeddings

        # Pooling size per stage: a shrinking share of the feature count for
        # every stage but the last (never below 1), and a fixed 3 for the last.
        n_stage = self.conv_len
        self.p = [
            max(1, int((1 - (stage / n_stage) ** (n_stage - stage)) * self.sparse_feat_len))
            if stage < n_stage else 3
            for stage in range(1, n_stage + 1)
        ]
        self.max_pooling_list = [KMaxPooling(size, dim=2) for size in self.p]

        stage_widths = [conv_kernel_width[stage] for stage in range(n_stage)]
        self.padding_list = [ZeroPadding2D(padding=(0, width - 1)) for width in stage_widths]
        self.conv_list = [Conv2D(filters=n_filter, kernel_size=(1, width))
                          for n_filter, width in zip(conv_filters, stage_widths)]

        self.flatten = Flatten()
        self.dense = Dense(units=1)

    def call(self, inputs, training=None, mask=None):
        """
        Run the forward pass.

        :param inputs: integer feature indices, indexed as ``inputs[:, i]``
            for feature ``i`` (expected layout: batch, n_features).
        :param training: unused.
        :param mask: unused.
        :return: sigmoid of the single dense output unit.
        """
        # Embed each feature column, then stack the results on a new last axis.
        embedded = [layer(inputs[:, col]) for col, layer in enumerate(self.embedding_list)]
        feature_map = tf.stack(embedded, axis=-1)
        # Add a trailing channel axis for the 2-D convolutions.
        feature_map = tf.expand_dims(feature_map, axis=3)

        for pad, conv, pool in zip(self.padding_list, self.conv_list, self.max_pooling_list):
            feature_map = relu(pool(conv(pad(feature_map))))

        logits = self.dense(self.flatten(feature_map))
        return sigmoid(logits)


In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.callbacks import EarlyStopping

# Location of the Avazu train.gz / test.gz files.
path = '../input/avazu-ctr-prediction/'

# Embedding size used for every sparse feature.
embed_dim = 11

# Optimisation settings.
learning_rate = 0.001
batch_size = 128
epochs = 20

(train_x, train_y), (val_x, val_y), test_x, features_columns = create_avazu_dataset(
    path, read_part=True, samples_num=10000000, embed_dim=embed_dim)

model = CCPM(feat_column=features_columns)
# Pass the configured learning rate explicitly so changing the constant above
# actually affects training.
model.compile(optimizer=Adam(learning_rate=learning_rate), loss=binary_crossentropy)

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Validate on the hold-out split returned by create_avazu_dataset rather than
# carving a second validation slice out of the training rows.
model.fit(x=train_x, y=train_y, batch_size=batch_size, epochs=epochs,
          validation_data=(val_x, val_y), shuffle=True, callbacks=[early_stopping])

result = model.predict(test_x, batch_size=10000)
# One prediction per line, without header or index.
pd.Series(result.flatten()).to_csv('result.csv', header=False, index=False)
