In [1]:
import os, sys, gc, pickle
sys.path.append('../')
import preprocess
from model.moe import MOE
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from tqdm import tqdm
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names

import logging
# Module-level logger for the notebook's own messages; in the cell outputs its
# records appear under the `__main__` logger name.
logger = logging.getLogger(__name__)

In [2]:
# Root directory that holds every pretrained-embedding pickle.
W2V_DIR = '../my_data/'
# Directory of the `w2v_models_sg_ns_64` model family.
# W2V_DIR already ends with '/', so each path below contains '//'; that is kept
# so the resulting strings are identical to the ones used so far.
_SG_NS_64_DIR = f'{W2V_DIR}/w2v_models_sg_ns_64'

# Registry of pretrained embedding files: model family -> {embedding name -> pickle path}.
pretrained_models = {
    'sg_ns_64': dict(
        # --- feed-side embeddings ---
        feedid=f'{_SG_NS_64_DIR}/feedid_w7_iter10.64d.pkl',
        official_feed=f'{W2V_DIR}/official_feed_emb.d512.pkl',
        official_feed_pca=f'{W2V_DIR}/official_feed_emb_pca.d32.pkl',
        feed_description_tfidf_cls_18=f'{_SG_NS_64_DIR}/feed_description_tfidf_cls_18_w7_iter10.64d.pkl',
        feed_machine_kw_tfidf_cls_17=f'{_SG_NS_64_DIR}/feed_machine_kw_tfidf_cls_17_w7_iter10.64d.pkl',
        feed_machine_tag_tfidf_cls_32=f'{_SG_NS_64_DIR}/feed_machine_tag_tfidf_cls_32_w7_iter10.64d.pkl',
        feed_manu_kw_tfidf_cls_22=f'{_SG_NS_64_DIR}/feed_manu_kw_tfidf_cls_22_w7_iter10.64d.pkl',
        feed_manu_tag_tfidf_cls_32=f'{_SG_NS_64_DIR}/feed_manu_tag_tfidf_cls_32_w7_iter10.64d.pkl',
        # --- author-side embeddings ---
        authorid=f'{_SG_NS_64_DIR}/authorid_w7_iter10.64d.pkl',
        author_description_tfidf_cls_18=f'{_SG_NS_64_DIR}/author_description_tfidf_cls_18_w7_iter10.64d.pkl',
        author_machine_kw_tfidf_cls_18=f'{_SG_NS_64_DIR}/author_machine_kw_tfidf_cls_18_w7_iter10.64d.pkl',
        author_machine_tag_tfidf_cls_21=f'{_SG_NS_64_DIR}/author_machine_tag_tfidf_cls_21_w7_iter10.64d.pkl',
        author_manu_kw_tfidf_cls_18=f'{_SG_NS_64_DIR}/author_manu_kw_tfidf_cls_18_w7_iter10.64d.pkl',
        author_manu_tag_tfidf_cls_19=f'{_SG_NS_64_DIR}/author_manu_tag_tfidf_cls_19_w7_iter10.64d.pkl',
        # --- user, bgm and tag / keyword embeddings ---
        userid_by_feed=f'{_SG_NS_64_DIR}/userid_by_feedid_w10_iter10.64d.pkl',
        userid_by_author=f'{_SG_NS_64_DIR}/userid_by_authorid_w10_iter10.64d.pkl',
        bgm_singer_id=f'{_SG_NS_64_DIR}/bgm_singer_id_w7_iter10.64d.pkl',
        bgm_song_id=f'{_SG_NS_64_DIR}/bgm_song_id_w7_iter10.64d.pkl',
        feed_machine_tag=f'{_SG_NS_64_DIR}/feed_machine_tag_emb.64d.pkl',
        feed_manu_tag=f'{_SG_NS_64_DIR}/feed_manu_tag_emb.64d.pkl',
        feed_machine_kw=f'{_SG_NS_64_DIR}/machine_kw_w15_iter10.64d.pkl',
        feed_manu_kw=f'{_SG_NS_64_DIR}/feed_manu_kw_emb.64d.pkl',
    ),
}

# Names of the feature columns that are fed to the model; every other column
# found in the pickled feature lists / datasets is dropped.
_ID_AND_DENSE_FEATURES = [
    'userid',
    'feedid',
    'authorid',
    'bgm_song_id',
    'bgm_singer_id',
    'videoplayseconds_bin',
    'bgm_na',
    'videoplayseconds',
    'tag_manu_machine_corr',
]
_TFIDF_CLUSTER_FEATURES = [
    'feed_machine_tag_tfidf_cls_32',
    'feed_machine_kw_tfidf_cls_17',
    'author_machine_tag_tfidf_cls_21',
    'author_machine_kw_tfidf_cls_18',
]
USED_FEATURES = [*_ID_AND_DENSE_FEATURES, *_TFIDF_CLUSTER_FEATURES]

DATA_PATH = '../my_data/data_base/'


def _load_pickle(file_name):
    """Return the object unpickled from ``DATA_PATH/file_name``.

    The file is opened in a ``with`` block so its handle is closed as soon as
    loading finishes (the previous ``pickle.load(open(...))`` form left every
    handle open until garbage collection).

    SECURITY: ``pickle.load`` can execute arbitrary code stored in the file;
    only load artefacts written by this project's own preprocessing.
    """
    with open(os.path.join(DATA_PATH, file_name), 'rb') as handle:
        return pickle.load(handle)


def _select_features(dataset):
    """Keep only the entries of ``dataset`` named by ``dnn_feature_columns``."""
    return {f.name: dataset[f.name] for f in dnn_feature_columns}


# Load the full feature-column definitions, then keep only the columns listed
# in USED_FEATURES.
linear_feature_columns = [f for f in _load_pickle('linear_feature.pkl') if f.name in USED_FEATURES]
dnn_feature_columns = [f for f in _load_pickle('dnn_feature.pkl') if f.name in USED_FEATURES]
lbe_dict = preprocess.LBE_MODEL

# Train / validation inputs (X) and labels (y) for the `pri` and `semi` data sets
# (presumably the preliminary and semi-final competition rounds — TODO confirm).
pri_train_X = _load_pickle('pri_train_x.pkl')
pri_train_y = _load_pickle('pri_train_y.pkl')
pri_val_X = _load_pickle('pri_val_x.pkl')
pri_val_y = _load_pickle('pri_val_y.pkl')

semi_train_X = _load_pickle('semi_train_x.pkl')
semi_train_y = _load_pickle('semi_train_y.pkl')
semi_val_X = _load_pickle('semi_val_x.pkl')
semi_val_y = _load_pickle('semi_val_y.pkl')

# Restrict every input dict to the selected feature columns.
semi_train_X = _select_features(semi_train_X)
semi_val_X = _select_features(semi_val_X)
pri_train_X = _select_features(pri_train_X)
pri_val_X = _select_features(pri_val_X)

# The offline training set is semi_train followed by pri_train (rows appended
# along axis 0, inputs and labels in the same order).
semi_train_X = {
    col: np.concatenate((semi_train_X[col], pri_train_X[col]), axis=0)
    for col in semi_train_X
}
semi_train_y = np.concatenate((semi_train_y, pri_train_y), axis=0)

# Hidden-layer sizes of the DNN tower (passed to MOE as `dnn_hidden_units`).
# The former `global hidden_units` statement was dropped: at module / cell level
# `global` has no effect, a plain assignment already creates the global name.
hidden_units = (512,256,128)

# Run configuration shared by the cells below.
args = {
    'USED_FEATURES': USED_FEATURES,
    'DATA_PATH': DATA_PATH,
    'hidden_units': hidden_units,
    'batch_size': 40000,
    # Embedding size is taken from the first selected DNN feature column.
    'emb_dim': dnn_feature_columns[0].embedding_dim,
    'learning_rate': 0.05,
    'lr_scheduler': True,
    'epochs': 2,
    'scheduler_epochs': 3,
    'num_warm_epochs': 0,
    'scheduler_method': 'cos',
    'use_bn': True,
    'reduction': 'sum',
    'optimizer': 'adagrad',
    'num_tasks': 7,
    'early_stop_uauc': 0.689,
    # Disabled option kept for reference:
    # 'pretrained_embeddings': ['userid_by_feed','feedid','authorid','official_feed']
    #   (other candidates: official_feed_pca, feed_machine_tag, feed_manu_tag,
    #    feed_machine_kw, feed_manu_kw)
    'num_workers': 7,
    # Task index -> name of the predicted action.
    'task_dict': {
        0: 'read_comment',
        1: 'like',
        2: 'click_avatar',
        3: 'forward',
        4: 'favorite',
        5: 'comment',
        6: 'follow',
    },
    # Task index -> loss weight; every task currently weighs 1.
    'task_weight': {task_id: 1 for task_id in range(7)},
    # 'opt_iters' and 'pbounds' are not read by any cell visible in this notebook;
    # presumably they are settings for a hyper-parameter search — TODO confirm.
    'opt_iters': [10, 10],
    'pbounds': {'dropout': (0.0, 0.9),
                #'learning_rate': 0.001,
                'l2_reg_dnn': (0.0001,0.0001),
                'l2_reg_embedding': (0.1, 0.1),
                'l2_reg_linear': (0.1, 0.1)
               },
    # Paths of the pretrained embedding pickles used by this run.
    'pretrained_model': pretrained_models['sg_ns_64'],
}

lbe_dict = preprocess.LBE_MODEL


def _pretrained_weight(feature, embedding_key):
    """Load one pretrained embedding weight matrix.

    `feature` selects the entry of `lbe_dict` and `embedding_key` selects the
    pickle path inside args['pretrained_model']; both are forwarded, together
    with padding=True, to preprocess.load_feature_pretrained_embedding.
    """
    return preprocess.load_feature_pretrained_embedding(
        lbe_dict[feature], args['pretrained_model'][embedding_key], padding=True)


# Load the pretrained embedding weight matrices (same order as before, so the
# loader's printed report keeps its order).
user_emb_weight = _pretrained_weight('userid', 'userid_by_feed')
author_emb_weight = _pretrained_weight('authorid', 'authorid')
feed_emb_weight = _pretrained_weight('feedid', 'feedid')
# Currently disabled feed-keyed alternatives: 'feed_machine_tag', 'feed_manu_tag',
# 'feed_machine_kw', 'feed_manu_kw' (each would be loaded with feature 'feedid').
official_feed_weight = _pretrained_weight('feedid', 'official_feed')


# Log a header line followed by the names of the feature columns that are
# actually present in the training inputs.
for _message in ('All used features:', semi_train_X.keys()):
    logger.info(_message)

from bayes_opt import BayesianOptimization

# Choose the torch device string used by every model in this notebook.
# GPU index 1 stays the preferred device; on a machine with a single visible
# GPU 'cuda:1' would be an invalid device, so fall back to 'cuda:0' there.
# Without CUDA (or when `device` is not 'gpu') everything runs on the CPU.
device = 'gpu'
if device == 'gpu' and torch.cuda.is_available():
    # print('cuda ready...')
    device = 'cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0'
else:
    device = 'cpu'

../my_data//w2v_models_sg_ns_64/userid_by_feedid_w10_iter10.64d.pkl
classes numbers:  219999
word2vec vocab size:  219999
Total Random initialized word embedding counts:  0
../my_data//w2v_models_sg_ns_64/authorid_w7_iter10.64d.pkl
classes numbers:  18789
word2vec vocab size:  18788
Total Random initialized word embedding counts:  1
../my_data//w2v_models_sg_ns_64/feedid_w7_iter10.64d.pkl
classes numbers:  106444
word2vec vocab size:  103864
Total Random initialized word embedding counts:  2580
../my_data//official_feed_emb.d512.pkl
classes numbers:  106444
word2vec vocab size:  106444


07/23/2021 09:55:38 - INFO - __main__ -   All used features:
07/23/2021 09:55:38 - INFO - __main__ -   dict_keys(['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id', 'videoplayseconds_bin', 'bgm_na', 'feed_machine_tag_tfidf_cls_32', 'feed_machine_kw_tfidf_cls_17', 'author_machine_tag_tfidf_cls_21', 'author_machine_kw_tfidf_cls_18', 'videoplayseconds', 'tag_manu_machine_corr'])


Total Random initialized word embedding counts:  0


In [3]:
# Helper model with no regularisation and no pretrained weights. It is never
# trained here; it is only handed to preprocess.get_dataloader
# (presumably so the loader can follow the model's feature layout — TODO confirm).
_moe = MOE(dnn_hidden_units=args['hidden_units'], linear_feature_columns=linear_feature_columns,
          dnn_feature_columns=dnn_feature_columns, task='binary', dnn_dropout=0.,
          l2_reg_embedding=0., l2_reg_dnn=0.,
          l2_reg_linear=0., device=device, seed=1233, num_tasks=args['num_tasks'],
          pretrained_user_emb_weight=None, 
          pretrained_author_emb_weight=None,
          pretrained_feed_emb_weight=None,
          )

# The worker count is read from args['num_workers'] (7) instead of repeating the
# literal, so editing the config cell actually changes the loaders.
train_loader = preprocess.get_dataloader(semi_train_X, _moe, y=semi_train_y,
                                         batch_size=args['batch_size'],
                                         num_workers=args['num_workers'])

# The validation loader carries inputs only (y=None); the labels semi_val_y are
# passed to fit() separately.
val_loader = preprocess.get_dataloader(semi_val_X, _moe, y=None,
                                       batch_size=args['batch_size'],
                                       num_workers=args['num_workers'])

# User id of every validation row, passed to fit() as val_userid_list.
val_userid_lst = semi_val_X['userid'].tolist()

In [4]:
# Dropout / L2 settings used by the training cells.
# A leftover note records a `target` of 0.6984976583814221 for this parameter set
# (presumably the score reported by a hyper-parameter search — TODO confirm).
params = dict(
    dropout=0.08746740041525639,
    l2_reg_dnn=0.0001,
    l2_reg_embedding=0.05032424704698356,
    l2_reg_linear=0.06932797233659868,
)

In [5]:
import random

# Fix the NumPy and Python RNGs for this run; the model receives its own seed
# through the `seed` argument below.
np.random.seed(2345)
random.seed(2345)

# Regularised model (dropout / L2 from `params`) initialised with the pretrained
# user, author and feed embedding matrices.
moe = MOE(dnn_hidden_units=args['hidden_units'], linear_feature_columns=linear_feature_columns,
          dnn_feature_columns=dnn_feature_columns, task='binary', dnn_dropout=params['dropout'],
          l2_reg_embedding=params['l2_reg_embedding'], l2_reg_dnn=params['l2_reg_dnn'],
          l2_reg_linear=params['l2_reg_linear'], device=device, seed=1233, num_tasks=args['num_tasks'],
          pretrained_user_emb_weight=[user_emb_weight],
          pretrained_author_emb_weight=[author_emb_weight],
          pretrained_feed_emb_weight=[feed_emb_weight,official_feed_weight],
          )

moe.compile(optimizer=args['optimizer'], learning_rate=args['learning_rate'], loss="binary_crossentropy", 
              metrics=["binary_crossentropy",'auc','uauc'])

# Offline run: train on train_loader, validate on (val_loader, semi_val_y).
# The epoch count is read from args['epochs'] (2) rather than a repeated literal,
# so the config cell is the single place to change it.
# NOTE(review): early_stop_uauc=0.55 differs from args['early_stop_uauc'] (0.689);
# confirm which threshold is intended before unifying them.
metric = moe.fit(train_loader, validation_data=[val_loader, semi_val_y],
                   epochs=args['epochs'], val_userid_list=val_userid_lst,
                   lr_scheduler=args['lr_scheduler'], scheduler_epochs=args['scheduler_epochs'],
                   scheduler_method=args['scheduler_method'], num_warm_epochs=args['num_warm_epochs'],
                   reduction=args['reduction'],
                   task_dict=args['task_dict'], task_weight=args['task_weight'],verbose=1,
                   early_stop_uauc=0.55)

07/23/2021 09:55:59 - INFO - model.moe -   Train on 72480000 samples, validate on 6103955 samples, 1812 steps per epoch
145it [01:19,  2.05it/s]

Please check the latest version manually on https://pypi.org/project/deepctr-torch/#history


1812it [27:34,  1.10it/s]
07/23/2021 10:25:58 - INFO - model.moe -   Epoch 1/2 1799s - loss:  0.2665 - read_comment_loss:  0.0909 - like_loss:  0.0908 - click_avatar_loss:  0.0362 - forward_loss:  0.0202 - favorite_loss:  0.0070 - comment_loss:  0.0036 - follow_loss:  0.0054 - val_read_comment_binary_crossentropy:  0.0904 - val_like_binary_crossentropy:  0.0896 - val_click_avatar_binary_crossentropy:  0.0356 - val_forward_binary_crossentropy:  0.0185 - val_favorite_binary_crossentropy:  0.0064 - val_comment_binary_crossentropy:  0.0028 - val_follow_binary_crossentropy:  0.0050 - val_read_comment_auc:  0.9338 - val_like_auc:  0.8583 - val_click_avatar_auc:  0.8758 - val_forward_auc:  0.8968 - val_favorite_auc:  0.9434 - val_comment_auc:  0.9000 - val_follow_auc:  0.8981 - val_read_comment_uauc:  0.6635 - val_like_uauc:  0.6609 - val_click_avatar_uauc:  0.7515 - val_forward_uauc:  0.7417 - val_favorite_uauc:  0.7690 - val_comment_uauc:  0.6262 - val_follow_uauc:  0.7337 - val_UAUC:  0.69

In [None]:
# NOTE(review): this cell repeats the offline training cell above it with the
# same seeds, model settings and fit arguments; its stored output comes from an
# earlier run. Consider deleting it once it is confirmed to be unneeded.
import random

# Fix the NumPy and Python RNGs for this run; the model receives its own seed
# through the `seed` argument below.
np.random.seed(2345)
random.seed(2345)

# Regularised model (dropout / L2 from `params`) initialised with the pretrained
# user, author and feed embedding matrices.
moe = MOE(dnn_hidden_units=args['hidden_units'], linear_feature_columns=linear_feature_columns,
          dnn_feature_columns=dnn_feature_columns, task='binary', dnn_dropout=params['dropout'],
          l2_reg_embedding=params['l2_reg_embedding'], l2_reg_dnn=params['l2_reg_dnn'],
          l2_reg_linear=params['l2_reg_linear'], device=device, seed=1233, num_tasks=args['num_tasks'],
          pretrained_user_emb_weight=[user_emb_weight],
          pretrained_author_emb_weight=[author_emb_weight],
          pretrained_feed_emb_weight=[feed_emb_weight,official_feed_weight],
          )

moe.compile(optimizer=args['optimizer'], learning_rate=args['learning_rate'], loss="binary_crossentropy", 
              metrics=["binary_crossentropy",'auc','uauc'])

# The epoch count is read from args['epochs'] (2) rather than a repeated literal.
# NOTE(review): early_stop_uauc=0.55 differs from args['early_stop_uauc'] (0.689);
# confirm which threshold is intended before unifying them.
metric = moe.fit(train_loader, validation_data=[val_loader, semi_val_y],
                   epochs=args['epochs'], val_userid_list=val_userid_lst,
                   lr_scheduler=args['lr_scheduler'], scheduler_epochs=args['scheduler_epochs'],
                   scheduler_method=args['scheduler_method'], num_warm_epochs=args['num_warm_epochs'],
                   reduction=args['reduction'],
                   task_dict=args['task_dict'], task_weight=args['task_weight'],verbose=1,
                   early_stop_uauc=0.55)

07/23/2021 00:32:11 - INFO - model.moe -   Train on 72480000 samples, validate on 6103955 samples, 1812 steps per epoch
1812it [14:35,  2.07it/s]
07/23/2021 00:48:58 - INFO - model.moe -   Epoch 1/2 1006s - loss:  0.2665 - read_comment_loss:  0.0907 - like_loss:  0.0910 - click_avatar_loss:  0.0363 - forward_loss:  0.0201 - favorite_loss:  0.0070 - comment_loss:  0.0037 - follow_loss:  0.0054 - val_read_comment_binary_crossentropy:  0.0904 - val_like_binary_crossentropy:  0.0897 - val_click_avatar_binary_crossentropy:  0.0357 - val_forward_binary_crossentropy:  0.0185 - val_favorite_binary_crossentropy:  0.0064 - val_comment_binary_crossentropy:  0.0028 - val_follow_binary_crossentropy:  0.0050 - val_read_comment_auc:  0.9338 - val_like_auc:  0.8576 - val_click_avatar_auc:  0.8758 - val_forward_auc:  0.8966 - val_favorite_auc:  0.9457 - val_comment_auc:  0.8971 - val_follow_auc:  0.8981 - val_read_comment_uauc:  0.6658 - val_like_uauc:  0.6604 - val_click_avatar_uauc:  0.7525 - val_for

In [6]:
# Helper model with no regularisation and no pretrained weights. It is never
# trained here; it is only handed to preprocess.get_dataloader
# (presumably so the loader can follow the model's feature layout — TODO confirm).
_moe = MOE(dnn_hidden_units=args['hidden_units'], linear_feature_columns=linear_feature_columns,
          dnn_feature_columns=dnn_feature_columns, task='binary', dnn_dropout=0.,
          l2_reg_embedding=0., l2_reg_dnn=0.,
          l2_reg_linear=0., device=device, seed=1233, num_tasks=args['num_tasks'],
          pretrained_user_emb_weight=None,
          pretrained_author_emb_weight=None,
          pretrained_feed_emb_weight=None,
          )

# Training set for the online (submission) model: the offline training rows
# followed by both validation splits, so no labelled data is held out.
online_train_X = {
    col: np.concatenate((semi_train_X[col], semi_val_X[col], pri_val_X[col]), axis=0)
    for col in semi_train_X
}
online_train_y = np.concatenate((semi_train_y, semi_val_y, pri_val_y), axis=0)

# The worker count is read from args['num_workers'] (7) instead of repeating the
# literal, so editing the config cell actually changes the loader.
online_train_loader = preprocess.get_dataloader(online_train_X, _moe, y=online_train_y, 
                                              batch_size=args['batch_size'],  
                                              num_workers=args['num_workers'])

In [7]:
import random

# Fix the NumPy and Python RNGs for this run; the model receives its own seed
# through the `seed` argument below.
np.random.seed(2345)
random.seed(2345)

# Regularised model (dropout / L2 from `params`) initialised with the pretrained
# user, author and feed embedding matrices.
moe = MOE(dnn_hidden_units=args['hidden_units'], linear_feature_columns=linear_feature_columns,
          dnn_feature_columns=dnn_feature_columns, task='binary', dnn_dropout=params['dropout'],
          l2_reg_embedding=params['l2_reg_embedding'], l2_reg_dnn=params['l2_reg_dnn'],
          l2_reg_linear=params['l2_reg_linear'], device=device, seed=1233, num_tasks=args['num_tasks'],
          pretrained_user_emb_weight=[user_emb_weight],
          pretrained_author_emb_weight=[author_emb_weight],
          pretrained_feed_emb_weight=[feed_emb_weight,official_feed_weight],
          )

moe.compile(optimizer=args['optimizer'], learning_rate=args['learning_rate'], loss="binary_crossentropy", 
              metrics=["binary_crossentropy",'auc','uauc'])

# Online run: train on online_train_loader with no validation data.
# The epoch count is read from args['epochs'] (2) rather than a repeated literal,
# so the config cell is the single place to change it.
# NOTE(review): early_stop_uauc=0.55 is still passed although validation_data is
# None; whether it has any effect here depends on MOE.fit — confirm.
metric = moe.fit(online_train_loader, validation_data=None,
                   epochs=args['epochs'], val_userid_list=None,
                   lr_scheduler=args['lr_scheduler'], scheduler_epochs=args['scheduler_epochs'],
                   scheduler_method=args['scheduler_method'], num_warm_epochs=args['num_warm_epochs'],
                   reduction=args['reduction'], 
                   task_dict=args['task_dict'], task_weight=args['task_weight'],verbose=1,
                   early_stop_uauc=0.55)

07/23/2021 10:58:04 - INFO - model.moe -   Train on 79200000 samples, validate on 0 samples, 1980 steps per epoch
1980it [29:58,  1.10it/s]
07/23/2021 11:28:13 - INFO - model.moe -   Epoch 1/2 1808s - loss:  0.2652 - read_comment_loss:  0.0904 - like_loss:  0.0907 - click_avatar_loss:  0.0361 - forward_loss:  0.0199 - favorite_loss:  0.0069 - comment_loss:  0.0035 - follow_loss:  0.0053
1980it [30:00,  1.10it/s]
07/23/2021 11:58:24 - INFO - model.moe -   Epoch 2/2 1811s - loss:  0.2423 - read_comment_loss:  0.0847 - like_loss:  0.0861 - click_avatar_loss:  0.0331 - forward_loss:  0.0178 - favorite_loss:  0.0055 - comment_loss:  0.0025 - follow_loss:  0.0044


In [8]:
# Test set: load the pickled inputs, then keep only the model's feature columns.
# The file is opened in a `with` block so the handle is closed after loading.
# SECURITY: pickle.load can execute arbitrary code stored in the file; only load
# artefacts written by this project's own preprocessing.
with open(DATA_PATH+'/semi_test_x.pkl','rb') as test_file:
    semi_test_X = pickle.load(test_file)
semi_test_X = {f.name:semi_test_X[f.name] for f in dnn_feature_columns}

# Inputs only (y=None). The worker count is read from args['num_workers'] (7)
# instead of repeating the literal.
online_test_loader = preprocess.get_dataloader(semi_test_X, moe, y=None,
                                              batch_size=args['batch_size'],
                                              num_workers=args['num_workers'])

In [9]:
# Score the test loader with `moe`, i.e. whichever model was fitted last; when the
# notebook is run top to bottom that is the model trained on online_train_loader.
# The result is later wrapped in a DataFrame and given seven action column names.
pred_arr = moe.predict(online_test_loader)

In [10]:
# Build and write the submission file.
# NOTE(review): TEST_CSV_PATH is an absolute, machine-specific path — consider
# moving it to the configuration cell.
TEST_CSV_PATH = '/home/tione/notebook/wbdc2021/data/wedata/wechat_algo_data2/test_a.csv'
SUBMISSION_PATH = 'results/pri_semi_all.lr0.05.s0.69866.csv'
ACTION_COLUMNS = ["read_comment","like","click_avatar","forward",'favorite','comment','follow']

test_sub = pd.read_csv(TEST_CSV_PATH, header=0)
df_res = pd.DataFrame(pred_arr)
df_res.columns = ACTION_COLUMNS

# pd.concat(axis=1) aligns on the index and would silently pad with NaN if the
# prediction count differed from the number of test rows, so fail loudly instead.
if len(df_res) != len(test_sub):
    raise ValueError(
        f'prediction rows ({len(df_res)}) do not match test rows ({len(test_sub)})')

test_sub = pd.concat([test_sub, df_res], axis=1)
# Post-processing rule kept from the original: read_comment is forced to 0 for
# rows whose `device` column equals 1 (rationale not visible in this notebook).
# The column is indexed explicitly rather than via attribute access.
test_sub.loc[test_sub['device'] == 1, 'read_comment'] = 0

# to_csv does not create missing directories; make sure the target one exists.
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)
test_sub[['userid', 'feedid'] + ACTION_COLUMNS]\
    .to_csv(SUBMISSION_PATH, header=True, index=False)