In [1]:
import os, sys, gc, pickle
sys.path.append('../')
from model.moe import MOE
import preprocess

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from tqdm import tqdm
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names

import logging
logger = logging.getLogger(__name__)

In [4]:
import importlib

# Re-execute the already-imported `preprocess` module so that edits made to
# its source are picked up without restarting the kernel, and rebind the name
# to the reloaded module object.
preprocess = importlib.reload(preprocess)

### 参数

In [2]:
# Names of the `*_tfidf_cls_*` columns (feed-level first, then author-level).
CLS_COLS = [
    'feed_manu_tag_tfidf_cls_32',
    'feed_machine_tag_tfidf_cls_32',
    'feed_manu_kw_tfidf_cls_22',
    'feed_machine_kw_tfidf_cls_17',
    'feed_description_tfidf_cls_18',
    'author_manu_tag_tfidf_cls_19',
    'author_machine_tag_tfidf_cls_21',
    'author_manu_kw_tfidf_cls_18',
    'author_machine_kw_tfidf_cls_18',
    'author_description_tfidf_cls_18',
]

# Names of the `*_topic_class` columns (feed-level first, then author-level).
TOPIC_COLS = [
    'feed_manu_tag_topic_class',
    'feed_machine_tag_topic_class',
    'feed_manu_kw_topic_class',
    'feed_machine_kw_topic_class',
    'feed_description_topic_class',
    'author_description_topic_class',
    'author_manu_kw_topic_class',
    'author_machine_kw_topic_class',
    'author_manu_tag_topic_class',
    'author_machine_tag_topic_class',
]

# Columns handed to process_pipe as sparse features: eight base columns
# followed by the two groups defined above.
SPARSE_COLS = [
    'userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id',
    'videoplayseconds_bin', 'device', 'bgm_na',
    *CLS_COLS,
    *TOPIC_COLS,
]
# Columns handed to process_pipe as dense features.
DENSE_COLS = ['videoplayseconds', 'tag_manu_machine_corr']
# Target columns; passed to preprocess.process_features as `actions`.
ACTIONS = ['read_comment', 'like', 'click_avatar', 'forward', 'favorite', 'comment', 'follow']

# Every column kept by the pipeline: sparse, then dense, then targets.
USED_COLUMNS = [*SPARSE_COLS, *DENSE_COLS, *ACTIONS]

# Input and output directories (all relative to the notebook location).
DATA_PATH_pri = '../../wbdc2021/data/wedata/wechat_algo_data1/'
DATA_PATH_semi = '../../wbdc2021/data/wedata/wechat_algo_data2/'
DATA_PATH = '../my_data/'
OUTPATH = '../my_data/data_base/'

# The directory constants already end with '/', and each suffix below starts
# with '/', so the resulting paths contain a double slash. That is kept
# unchanged here so the path strings stay exactly as before.
user_act_pri_path = f'{DATA_PATH_pri}/user_action.csv'
user_act_semi_path = f'{DATA_PATH_semi}/user_action.csv'
test_semi_path = f'{DATA_PATH_semi}/test_a.csv'
feed_path = f'{DATA_PATH}/feedid_text_features/feed_author_text_features_fillna_by_author_clusters.pkl'

In [3]:
def process_pipe(feed_path, user_act_path, used_columns, used_sparse_cols, used_dense_cols,
                 emb_dim=16, is_training=True, test_data=False):
    """Preprocess one action file and encode it into model inputs.

    The file is run through ``preprocess.preprocess`` and
    ``preprocess.down_sample`` and then encoded with
    ``preprocess.process_features``.

    Parameters
    ----------
    feed_path : path of the feed feature file, forwarded to
        ``preprocess.preprocess``.
    user_act_path : path of the user-action (or test) file, forwarded to
        ``preprocess.preprocess``.
    used_columns : columns forwarded to ``preprocess.down_sample``.
    used_sparse_cols, used_dense_cols : feature column lists forwarded to
        ``preprocess.process_features``.
    emb_dim : forwarded to ``preprocess.process_features``.
    is_training : forwarded to ``preprocess.down_sample``.
    test_data : when True the data is treated as a test set regardless of
        its dates. When False (default) the data is treated as a test set
        only if the first row has ``date_ == 15``.

    Returns
    -------
    list of tuples ``(X_dic, y_arr, linear_feats, dnn_feats, lbe_dict)``:
    one tuple for test data; two tuples (train: ``date_ < 14``, validation:
    ``date_ == 14``) otherwise. Each tuple carries the metadata returned by
    its own ``process_features`` call.

    Raises
    ------
    ValueError
        If no rows are left after down-sampling.
    """
    def _encode(frame):
        # Single place for the process_features settings so that every split
        # is encoded with exactly the same options.
        return preprocess.process_features(
            frame, used_sparse_cols, used_dense_cols,
            actions=ACTIONS, emb_dim=emb_dim, use_tag_text=None, use_kw_text=None,
            feed_history=None, author_history=None, use_din=False,
            max_seq_length=128, behavior_feature_list=['feedid', 'authorid'],
        )

    data = preprocess.preprocess(feed_path, user_act_path)
    data_ds = preprocess.down_sample(data, used_columns, sample_method=None,
                                     neg2pos_ratio=300, user_samp='random',
                                     by_date=None, is_training=is_training)

    if len(data_ds) == 0:
        # Fail with a clear message instead of an IndexError from the date probe.
        raise ValueError(f'no rows left after preprocessing {user_act_path!r}')

    # Test files are recognised either explicitly or by the date of their first row.
    if test_data or data_ds['date_'].iloc[0] == 15:
        X_dic, y_arr, linear_feats, dnn_feats, lbe_dict = _encode(data_ds)
        return [(X_dic, y_arr, linear_feats, dnn_feats, lbe_dict)]

    # Train / validation split by day. `.copy()` detaches each split from
    # `data_ds` so that column assignments performed downstream operate on an
    # independent frame (intended to avoid pandas' SettingWithCopyWarning).
    train_data = data_ds.query('date_<14').copy()
    val_data = data_ds.query('date_==14').copy()

    # Keep the outputs of the two calls in separate names so the train tuple
    # is not silently given the metadata produced by the validation call.
    # NOTE(review): if process_features fits its encoders on each call, train
    # and validation ids may not share one mapping -- confirm in preprocess.
    X_dic_train, y_arr_train, linear_feats_train, dnn_feats_train, lbe_dict_train = _encode(train_data)
    X_dic_val, y_arr_val, linear_feats_val, dnn_feats_val, lbe_dict_val = _encode(val_data)
    return [(X_dic_train, y_arr_train, linear_feats_train, dnn_feats_train, lbe_dict_train),
            (X_dic_val, y_arr_val, linear_feats_val, dnn_feats_val, lbe_dict_val)]

In [7]:
# Encode the semi-final test file; the result is a one-element list whose
# tuple is indexed as semi_test_train[0][...] in the next cell.
semi_test_train = process_pipe(
    feed_path=feed_path,
    user_act_path=test_semi_path,
    used_columns=USED_COLUMNS,
    used_sparse_cols=SPARSE_COLS,
    used_dense_cols=DENSE_COLS,
)

Prepeocessing >>> >>>
feed number: 106444
user actions number:  4252097
total data size:  4252097
Preprocessing Done <<< <<< <<<
~> Memory usage of dataframe is 1216.535 MG
~> Memory usage after optimization is: 231.142 MG
~> Decreased by 81.0%
------------------------------------------------------------


In [8]:
# Persist the encoded test inputs and the two feature-column descriptions.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes, instead of being left to garbage collection.
for out_name, payload in (
    ('semi_test_x.pkl', semi_test_train[0][0]),
    ('linear_feature.pkl', semi_test_train[0][2]),
    ('dnn_feature.pkl', semi_test_train[0][3]),
):
    with open(f'{OUTPATH}/{out_name}', 'wb') as out_file:
        pickle.dump(payload, out_file)

In [None]:
# Encode the semi-final action log; process_pipe returns the train tuple
# followed by the validation tuple.
semi_train, semi_val = process_pipe(
    feed_path=feed_path,
    user_act_path=user_act_semi_path,
    used_columns=USED_COLUMNS,
    used_sparse_cols=SPARSE_COLS,
    used_dense_cols=DENSE_COLS,
)

Prepeocessing >>> >>>
feed number: 106444
user actions number:  73175511
~> Memory usage of dataframe is 6141.133 MG
~> Memory usage after optimization is: 1186.355 MG
~> Decreased by 80.7%
------------------------------------------------------------
raw user_act number:  73175511
user_act_unique number:  71978260
user_act_sum number:  71978260
dropped duplicates user_act numbers:  71978260
total data size:  71978260
Preprocessing Done <<< <<< <<<


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feat] = lbe.transform(df[feat].astype(str))+1


In [None]:
# Persist inputs (index 0) and labels (index 1) of the semi-final train and
# validation splits. `with` guarantees each file is flushed and closed right
# after its dump rather than whenever the handle gets garbage collected.
for out_name, payload in (
    ('semi_train_x.pkl', semi_train[0]),
    ('semi_train_y.pkl', semi_train[1]),
    ('semi_val_x.pkl', semi_val[0]),
    ('semi_val_y.pkl', semi_val[1]),
):
    with open(f'{OUTPATH}/{out_name}', 'wb') as out_file:
        pickle.dump(payload, out_file)

In [13]:
# NOTE(review): scratch expression left over from interactive use; none of the
# visible cells read its value, so this cell is a candidate for removal.
123

123

In [None]:
# Encode the primary-round action log; process_pipe returns the train tuple
# followed by the validation tuple.
pri_train, pri_val = process_pipe(
    feed_path=feed_path,
    user_act_path=user_act_pri_path,
    used_columns=USED_COLUMNS,
    used_sparse_cols=SPARSE_COLS,
    used_dense_cols=DENSE_COLS,
)

In [None]:
# Persist inputs (index 0) and labels (index 1) of the primary-round train and
# validation splits. `with` guarantees each file is flushed and closed right
# after its dump rather than whenever the handle gets garbage collected.
for out_name, payload in (
    ('pri_train_x.pkl', pri_train[0]),
    ('pri_train_y.pkl', pri_train[1]),
    ('pri_val_x.pkl', pri_val[0]),
    ('pri_val_y.pkl', pri_val[1]),
):
    with open(f'{OUTPATH}/{out_name}', 'wb') as out_file:
        pickle.dump(payload, out_file)

### 采样