In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
from tqdm import tqdm


import pandas as pd
from pandas.core.frame import DataFrame

# Preferred accelerator is GPU index 5. If CUDA is available but that index
# does not exist on this machine (fewer visible GPUs), fall back to the first
# GPU instead of failing later with an "invalid device ordinal" error.
if torch.cuda.is_available():
    device = torch.device('cuda:5') if torch.cuda.device_count() > 5 else torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Dense feature columns that are excluded from the model input
# (they are filtered out of the full dense feature list further down).
nono = ['userid_duration_mean', 'userid_click_mean', 'docid_click_mean', 'keyword0_click_mean',
        'refresh_click_mean', 'docid_duration_mean', 'refresh_duration_mean']

# Name of the checkpoint sub-directory used when building the weights path.
pp = '10 8.0'

In [7]:
class DeepFM(nn.Module):
    """DeepFM model: sigmoid(FM first-order + FM second-order + DNN).

    The submodule attribute names (``fm_1st_order_dense``,
    ``fm_1st_order_sparse_emb``, ``fm_2nd_order_sparse_emb``, ``dense_linear``,
    ``linear_<i>``, ``batchNorm_<i>``, ``dnn_linear``) are the keys of
    ``state_dict()``. They are deliberately kept stable so that previously
    saved checkpoints keep loading.
    """

    def __init__(self, cate_fea_nuniqs, nume_fea_size=0, emb_size=8,
                 hid_dims=(256, 128), num_classes=1, dropout=(0.2, 0.2)):
        """
        Args:
            cate_fea_nuniqs: vocabulary size of every categorical feature, in
                the same order as the columns of ``X_sparse``.
            nume_fea_size: number of numeric features; 0 means the input is
                purely categorical.
            emb_size: width of the second-order embedding of each categorical
                feature.
            hid_dims: widths of the DNN hidden layers (any sequence; may be
                empty, in which case the output layer reads the embeddings
                directly).
            num_classes: output width of the DNN head.
            dropout: dropout probability per hidden layer; needs at least as
                many entries as ``hid_dims``.
        """
        super().__init__()
        # Copy into fresh lists: accepts tuples and never aliases the caller's
        # (or the default) sequence.
        hid_dims = list(hid_dims)
        dropout = list(dropout)

        self.cate_fea_size = len(cate_fea_nuniqs)
        self.nume_fea_size = nume_fea_size

        # ---------------- FM part ----------------
        # First order: a linear term for the numeric features (only when there
        # are any) and a scalar embedding per categorical feature.
        if self.nume_fea_size != 0:
            self.fm_1st_order_dense = nn.Linear(self.nume_fea_size, 1)
        self.fm_1st_order_sparse_emb = nn.ModuleList([
            nn.Embedding(voc_size, 1) for voc_size in cate_fea_nuniqs])

        # Second order: an emb_size-wide embedding per categorical feature.
        self.fm_2nd_order_sparse_emb = nn.ModuleList([
            nn.Embedding(voc_size, emb_size) for voc_size in cate_fea_nuniqs])

        # ---------------- DNN part ----------------
        self.all_dims = [self.cate_fea_size * emb_size] + hid_dims
        # Projects the numeric features to the width of the flattened
        # embeddings so both can be added. Created even when nume_fea_size is 0
        # to keep the state_dict layout unchanged.
        self.dense_linear = nn.Linear(self.nume_fea_size, self.cate_fea_size * emb_size)
        self.relu = nn.ReLU()
        # Hidden stack: Linear -> BatchNorm1d -> ReLU -> Dropout per layer.
        for i in range(1, len(self.all_dims)):
            setattr(self, 'linear_' + str(i), nn.Linear(self.all_dims[i - 1], self.all_dims[i]))
            setattr(self, 'batchNorm_' + str(i), nn.BatchNorm1d(self.all_dims[i]))
            setattr(self, 'activation_' + str(i), nn.ReLU())
            setattr(self, 'dropout_' + str(i), nn.Dropout(dropout[i - 1]))
        # Output layer; all_dims[-1] equals hid_dims[-1] whenever hidden layers
        # exist and still works when hid_dims is empty.
        self.dnn_linear = nn.Linear(self.all_dims[-1], num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, X_sparse, X_dense=None):
        """
        Args:
            X_sparse: categorical feature indices, [bs, cate_fea_size]
                (integer tensor, used for embedding lookups).
            X_dense: optional numeric features, [bs, nume_fea_size].

        Returns:
            Tensor [bs, num_classes] with values in (0, 1).
        """
        # ---- FM first-order term ----
        fm_1st_sparse_res = [emb(X_sparse[:, i].unsqueeze(1)).view(-1, 1)
                             for i, emb in enumerate(self.fm_1st_order_sparse_emb)]
        fm_1st_sparse_res = torch.cat(fm_1st_sparse_res, dim=1)  # [bs, cate_fea_size]
        fm_1st_part = torch.sum(fm_1st_sparse_res, 1, keepdim=True)  # [bs, 1]
        if X_dense is not None:
            fm_1st_part = fm_1st_part + self.fm_1st_order_dense(X_dense)  # [bs, 1]

        # ---- FM second-order term ----
        fm_2nd_order_res = [emb(X_sparse[:, i].unsqueeze(1))
                            for i, emb in enumerate(self.fm_2nd_order_sparse_emb)]
        # [bs, n, emb_size], n = number of categorical features
        fm_2nd_concat_1d = torch.cat(fm_2nd_order_res, dim=1)

        # 0.5 * ((sum of embeddings)^2 - sum of (embeddings^2)), summed over emb_size
        sum_embed = torch.sum(fm_2nd_concat_1d, 1)  # [bs, emb_size]
        square_sum_embed = sum_embed * sum_embed  # [bs, emb_size]
        sum_square_embed = torch.sum(fm_2nd_concat_1d * fm_2nd_concat_1d, 1)  # [bs, emb_size]
        sub = (square_sum_embed - sum_square_embed) * 0.5  # [bs, emb_size]
        fm_2nd_part = torch.sum(sub, 1, keepdim=True)  # [bs, 1]

        # ---- DNN term ----
        dnn_out = torch.flatten(fm_2nd_concat_1d, 1)  # [bs, n * emb_size]
        if X_dense is not None:
            dense_out = self.relu(self.dense_linear(X_dense))  # [bs, n * emb_size]
            dnn_out = dnn_out + dense_out

        for i in range(1, len(self.all_dims)):
            dnn_out = getattr(self, 'linear_' + str(i))(dnn_out)
            dnn_out = getattr(self, 'batchNorm_' + str(i))(dnn_out)
            dnn_out = getattr(self, 'activation_' + str(i))(dnn_out)
            dnn_out = getattr(self, 'dropout_' + str(i))(dnn_out)

        dnn_out = self.dnn_linear(dnn_out)  # [bs, num_classes]
        out = fm_1st_part + fm_2nd_part + dnn_out
        return self.sigmoid(out)

In [8]:
# Test samples plus the document / user feature tables used to size the embeddings.
df_test_user_doc = pd.read_pickle('../../data/wj/df_test_user_doc_64_new.pkl')
doc_feat = pd.read_pickle('../../data/wj/doc.pkl')
user_feat = pd.read_pickle('../../data/wj/user.pkl')

# Categorical columns, in the order the model's embedding tables are built.
sparse_features = [
    'userid', 'docid', 'network', 'hour', 'device', 'os', 'province', 'city',
    'age', 'gender', 'category1st', 'category2nd', 'pub_date',
    'keyword0', 'keyword1', 'keyword2', 'keyword3', 'keyword4',
]

# Every candidate numeric column, before the exclusion list is applied.
dense_features0 = [
    'refresh', 'picnum',
    'userid_click_mean', 'userid_click_count', 'userid_duration_mean', 'userid_picnum_mean',
    'docid_click_mean', 'docid_click_count', 'docid_duration_mean', 'docid_picnum_mean',
    'category1st_click_mean', 'category1st_click_count', 'category1st_duration_mean', 'category1st_picnum_mean',
    'category2nd_click_mean', 'category2nd_click_count', 'category2nd_duration_mean', 'category2nd_picnum_mean',
    'keyword0_click_mean', 'keyword0_click_count', 'keyword0_duration_mean', 'keyword0_picnum_mean',
    'network_click_mean', 'network_click_count', 'network_duration_mean',
    'refresh_click_mean', 'refresh_click_count', 'refresh_duration_mean',
    'device_click_mean', 'device_click_count', 'device_duration_mean',
    'os_click_mean', 'os_click_count', 'os_duration_mean',
    'province_click_mean', 'province_click_count', 'province_duration_mean',
    'city_click_mean', 'city_click_count', 'city_duration_mean',
    'age_click_mean', 'age_click_count', 'age_duration_mean',
    'gender_click_mean', 'gender_click_count', 'gender_duration_mean',
]

# Keep only the numeric columns that are not on the exclusion list.
dense_features = [name for name in dense_features0 if name not in nono]
print(len(dense_features0), len(dense_features))

# All five keyword columns share one vocabulary size: largest keyword id + 1.
keyword_cols = ['keyword0', 'keyword1', 'keyword2', 'keyword3', 'keyword4']
keyword_nunique = max(doc_feat[col].max() for col in keyword_cols) + 1

# Vocabulary size per categorical feature, aligned with sparse_features.
# NOTE(review): 6 (network) and 13 (hour) are fixed sizes and only userid/docid
# get a "+ 1" -- presumably these mirror the training setup; confirm before changing.
cate_fea_nuniqs = (
    [user_feat['userid'].nunique() + 1,
     doc_feat['docid'].nunique() + 1,
     6,    # network
     13]   # hour
    + [user_feat[col].nunique()
       for col in ('device', 'os', 'province', 'city', 'age', 'gender')]
    + [doc_feat[col].nunique()
       for col in ('category1st', 'category2nd', 'pub_date')]
    + [keyword_nunique] * len(keyword_cols)
)

46 39


In [9]:
import time
import pandas as pd
def predict(test_df, s_feat, den_feat, model, device, modeln):
    """Run batched inference over ``test_df`` and write the scores to CSV.

    Args:
        test_df: DataFrame holding the columns named in ``s_feat`` and ``den_feat``.
        s_feat: categorical column names (fed to the model as a LongTensor).
        den_feat: numeric column names (fed to the model as a FloatTensor).
        model: callable module taking ``(categorical_batch, numeric_batch)``.
        device: torch device the batches are moved to before the forward pass.
        modeln: model tag embedded in the output file name.

    Returns:
        DataFrame with columns ``id`` (0..N-1, row order of ``test_df``) and
        ``pred``; the same content is written, without header or index, to
        ``../../data/wj/deepfm_result/deepfm_result_user_<modeln>.csv``.
    """
    test_dataset = Data.TensorDataset(torch.LongTensor(test_df[s_feat].values),
                                      torch.FloatTensor(test_df[den_feat].values))
    # shuffle=False keeps predictions aligned with the row order of test_df.
    test_loader = Data.DataLoader(dataset=test_dataset, batch_size=4096, shuffle=False)

    # Inference must run in eval mode (BatchNorm / Dropout); do not rely on the
    # caller having switched, and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    test_preds = []
    try:
        with torch.no_grad():
            # Wrapping the loader itself lets tqdm show the total batch count.
            for cate_fea, nume_fea in tqdm(test_loader):
                cate_fea, nume_fea = cate_fea.to(device), nume_fea.to(device)
                test_preds.extend(model(cate_fea, nume_fea).reshape(-1).cpu().tolist())
    finally:
        model.train(was_training)

    out_df = pd.DataFrame({"id": list(range(len(test_preds))), "pred": test_preds})
    out_df.to_csv('../../data/wj/deepfm_result/deepfm_result_user_'+modeln+'.csv', sep=',', header=False, index=False)
    return out_df

In [5]:
# Tag of the checkpoint to evaluate; also used in the name of the result file.
modeln = '0.7618_4_6_12_18_08_28_40'
model = DeepFM(cate_fea_nuniqs, nume_fea_size=len(dense_features))

# map_location='cpu' lets the checkpoint deserialize even when the GPU it was
# saved from is not present here; the model is moved to `device` right after.
checkpoint_path = '../../data/wj/deepfm_best/'+pp+'/deepfm_best_'+modeln+'.pth'
model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))

model = model.to(device)
model.eval()  # switch to inference mode (BatchNorm / Dropout)
predict(df_test_user_doc, sparse_features, dense_features, model, device, modeln)


13it [00:00, 17.05it/s]
