In [2]:

import numpy as np
from tqdm import tqdm

def reduce_mem(df, cols, use_float16=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are re-assigned, so the caller's object
        is modified.
    cols : iterable of str
        Labels of the columns to consider.
    use_float16 : bool, default True
        When True (the historical behaviour) a float column whose range fits
        is stored as float16, which keeps only about three significant
        decimal digits. Pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Memory usage before and after (in Mb) and the
        percentage saved are printed as a side effect.

    Notes
    -----
    Only plain NumPy integer and floating columns are converted. Object,
    bool, datetime/timedelta and pandas extension dtypes are left untouched,
    and a column is never converted to a dtype that is not strictly smaller
    than the one it already has.
    """
    # Candidates are ordered from narrowest to widest.
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in tqdm(cols):
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, strings, ...) are not
        # np.dtype instances and cannot be range-checked below.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.integer):
            candidates, limits = int_candidates, np.iinfo
        elif np.issubdtype(col_type, np.floating):
            candidates, limits = float_candidates, np.finfo
        else:
            # object, bool, datetime64, timedelta64, complex, ...
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                # Every remaining candidate is at least as wide: never widen.
                break
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the limit still fits.
            # NaN min/max (empty or all-NaN column) fails both comparisons,
            # which leaves the column unchanged.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2

    # Guard the percentage against a zero starting size.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, saved_pct))

    return df

In [3]:
import warnings
warnings.simplefilter('ignore')
import os

from sklearn.preprocessing import LabelEncoder

import pandas as pd
# Use the fully-qualified option names: the bare 'max_columns' / 'max_rows'
# patterns also match the 'styler.render.*' options of newer pandas releases
# and are then rejected as ambiguous.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
from tqdm import tqdm

# from utils import reduce_mem

# Directory holding the input pickles, and the directory for derived feature files.
data_path = '../data'
feat_path = '../data/feat'

# Load the train and test frames from their pickles.
df_train = pd.read_pickle(f'{data_path}/train.pkl')
df_test = pd.read_pickle(f'{data_path}/test.pkl')

# Not the last expression of the cell, so the result is computed and discarded.
df_test['date'].value_counts()

# Stack train on top of test so both receive the same feature processing.
df_data = pd.concat([df_train, df_test], sort=False)

# user_info
user_info = pd.read_pickle(f'{data_path}/user_info.pkl')
user_info.head()

# No `on=` is given, so pandas joins on whatever column names the two frames
# share (presumably userid -- TODO confirm against the data).
df_data = df_data.merge(user_info, how='left')

# doc_info

doc_info = pd.read_pickle(f'{data_path}/doc_info.pkl')
doc_info.head()

# Filled by get_all_keyword() with every distinct keyword name.
all_keywords = set()

def get_all_keyword(x):
    """Add every keyword name found in ``x`` to the module-level ``all_keywords`` set.

    ``x`` is a comma-separated string; for each item only the text before the
    first ``:`` is kept. An empty string adds nothing. Always returns ``None``;
    the function exists for its side effect on ``all_keywords``.
    """
    if x == '':
        return

    # The set is mutated in place, so no ``global`` declaration is required.
    all_keywords.update(item.split(':')[0] for item in x.split(','))


# Assign the filled column back instead of calling fillna(inplace=True) on the
# object returned by doc_info['keyword']: with pandas Copy-on-Write that
# object is a temporary and the frame itself would keep its missing values.
doc_info['keyword'] = doc_info['keyword'].fillna('')
# Called for its side effect: collects every keyword name into all_keywords.
doc_info['keyword'].apply(get_all_keyword)

# Ids start at 1. The names are sorted first so that the keyword -> id mapping
# is the same on every run; iterating the set directly gives an arbitrary order.
keyword2id = dict(zip(sorted(all_keywords), range(1, len(all_keywords) + 1)))

def keyword_map(x):
    """Translate a comma-separated keyword string into a list of integer ids.

    For each item the text before the first ``:`` is looked up in the
    module-level ``keyword2id`` dict. An empty string yields an empty list.
    """
    if x == '':
        return []

    return [keyword2id[item.split(':')[0]] for item in x.split(',')]


# Replace each keyword string by its list of integer ids.
doc_info['keyword'] = doc_info['keyword'].apply(keyword_map)

doc_info['docid'] = doc_info['docid'].astype('int')
# Attach the document attributes to every row by docid.
df_data = df_data.merge(
    doc_info[['docid', 'category1st', 'category2nd', 'keyword', 'pubtime']], how='left', on='docid')

sparse_features = [
    'userid', 'docid', 'network', 'device', 'os', 'province', 'city', 'age',
    'gender', 'category1st', 'category2nd'
]

for col in tqdm(sparse_features):
    lbe = LabelEncoder()
    # Shift the encoder's 0-based codes so that they start at 1.
    # The former `df_data[col].fillna(0, inplace=True)` was removed: the
    # encoded column contains no NaN, so the call could never change anything
    # (and it operated on a temporary object under pandas Copy-on-Write).
    # NOTE(review): missing values are therefore handled by LabelEncoder
    # itself and never become 0 -- confirm whether 0 was meant to be reserved
    # for them.
    df_data[col] = lbe.fit_transform(df_data[col]) + 1

# Downcast every column except the listed id/time columns.
df_data = reduce_mem(df_data, cols=[f for f in df_data.columns if f not in ['userid', 'docid', 'id', 'dt']])


os.makedirs(f'{feat_path}', exist_ok=True)
df_data.to_pickle(f'{feat_path}/feat_basic.pkl')

100%|██████████| 11/11 [00:00<00:00, 12.71it/s]
100%|██████████| 17/17 [00:00<00:00, 120.96it/s]


145.47 Mb, 76.87 Mb (47.16 %)


In [9]:
# Plain-text dump of the frame, followed by the row(s) labelled 0.
print(df_data, df_data.loc[0], sep='\n')

        userid  docid      timestamp  network  refresh  position  click  \
0       247866  40874  1624869556203        4       19     207.0    0.0   
1        35568  51761  1624838347464        4        4      58.0    1.0   
2        53471  18302  1624656823023        4        4       9.0    0.0   
3       129943  55783  1624865763852        4        1      20.0    0.0   
4       275626  55874  1624930276109        4        1      25.0    0.0   
...        ...    ...            ...      ...      ...       ...    ...   
866706  117928  73437  1625036568698        1        1       NaN    NaN   
866707  140882  68813  1625009469794        1        1       NaN    NaN   
866708  229967  74845  1625027566512        4        1       NaN    NaN   
866709    2399  62844  1625034692217        1       11       NaN    NaN   
866710  148875  64410  1625008637470        1        3       NaN    NaN   

        duration                               dt        date       id  \
0            0.0 2021-06-

In [11]:
import warnings
warnings.simplefilter('ignore')
import os

import pandas as pd
# Use the fully-qualified option names: the bare 'max_columns' / 'max_rows'
# patterns also match the 'styler.render.*' options of newer pandas releases
# and are then rejected as ambiguous.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
from tqdm import tqdm

# from utils import reduce_mem

feat_path = '../data/feat'

# Load the basic features without the two columns this stage does not use.
df = pd.read_pickle(f'{feat_path}/feat_basic.pkl').drop(columns=['keyword', 'dt'])

# Number the distinct dates 0, 1, 2, ... in ascending order and store that
# number for every row as `day`.
dates = df['date'].unique()
dates.sort()
date_map = {date: day for day, date in enumerate(dates)}
df['day'] = df['date'].map(date_map)

# Key combinations for which per-day history statistics are built.
group_keys = [['userid'], ['docid'], ['category1st'], ['category2nd'],
              ['userid', 'category1st'], ['userid', 'category2nd']]
# Days 1 .. last day; each one is described using only rows of strictly
# earlier days.
history_days = range(1, max(date_map.values()) + 1)

# Number of earlier rows per key.
for feat in tqdm(group_keys):
    count_col = f'{"_".join(feat)}_history_count'
    stat_df = pd.concat([
        df[df['day'] < d].groupby(feat).size().reset_index(name=count_col).assign(day=d)
        for d in history_days
    ])

    df = df.merge(stat_df, how='left', on=feat + ['day'])

# Mean of the click target over earlier rows per key (click-through rate).
target = 'click'
for gp in tqdm(group_keys):
    name = f"{'_'.join(gp)}_ctr"
    stat_df = pd.concat([
        df[df['day'] < d].groupby(gp)[target].agg([(name, 'mean')]).reset_index().assign(day=d)
        for d in history_days
    ])

    df = df.merge(stat_df, how='left', on=gp + ['day'])


# Mean and standard deviation of duration over earlier rows per key.
target = 'duration'
for gp in tqdm(group_keys):
    name_mean = f"{'_'.join(gp)}_history_duration_mean"
    name_std = f"{'_'.join(gp)}_history_duration_std"
    stat_df = pd.concat([
        df[df['day'] < d].groupby(gp)[target]
        .agg([(name_mean, 'mean'), (name_std, 'std')])
        .reset_index()
        .assign(day=d)
        for d in history_days
    ])

    df = df.merge(stat_df, how='left', on=gp + ['day'])
    
# Downcast every column except the listed id/time columns, then persist.
keep_as_is = ['userid', 'docid', 'id', 'dt']
df = reduce_mem(df, cols=[c for c in df.columns if c not in keep_as_is])

os.makedirs(feat_path, exist_ok=True)
df.to_pickle(f'{feat_path}/feat_basic_history_all.pkl')

100%|██████████| 6/6 [00:07<00:00,  1.18s/it]
100%|██████████| 6/6 [00:08<00:00,  1.36s/it]
100%|██████████| 6/6 [00:11<00:00,  1.87s/it]
100%|██████████| 41/41 [00:00<00:00, 46.82it/s]


169.44 Mb, 105.80 Mb (37.56 %)


In [12]:
# Plain-text dump of the frame, followed by the row(s) labelled 0.
print(df, df.loc[0], sep='\n')

        userid  docid      timestamp  network  refresh  position  click  \
0       247866  40874  1624869556203        4       19     207.0    0.0   
1        35568  51761  1624838347464        4        4      58.0    1.0   
2        53471  18302  1624656823023        4        4       9.0    0.0   
3       129943  55783  1624865763852        4        1      20.0    0.0   
4       275626  55874  1624930276109        4        1      25.0    0.0   
...        ...    ...            ...      ...      ...       ...    ...   
866706  117928  73437  1625036568698        1        1       NaN    NaN   
866707  140882  68813  1625009469794        1        1       NaN    NaN   
866708  229967  74845  1625027566512        4        1       NaN    NaN   
866709    2399  62844  1625034692217        1       11       NaN    NaN   
866710  148875  64410  1625008637470        1        3       NaN    NaN   

        duration        date       id  device  os  province  city  age  \
0            0.0  2021-06

In [13]:
import warnings
warnings.simplefilter('ignore')
import os

import pandas as pd
# Use the fully-qualified option names: the bare 'max_columns' / 'max_rows'
# patterns also match the 'styler.render.*' options of newer pandas releases
# and are then rejected as ambiguous.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
from tqdm import tqdm

# from utils import reduce_mem

feat_path = '../data/feat'

# Load the basic features without the two columns this stage does not use.
df = pd.read_pickle(os.path.join(feat_path, 'feat_basic.pkl')).drop(columns=['keyword', 'dt'])

# How often each single key value occurs in the whole frame.
for f in tqdm(['userid', 'docid', 'category1st', 'category2nd']):
    occurrences = df[f].value_counts()
    df[f'{f}_count'] = df[f].map(occurrences)

# How often each key pair occurs, attached through a left merge.
for f in tqdm([['userid', 'category1st'], ['userid', 'category2nd']]):
    pair_col = '_'.join(f) + '_count'
    df_temp = df.groupby(f).size().reset_index(name=pair_col)
    df = df.merge(df_temp, how='left', on=f)

# Downcast every column except the listed id/time columns, then persist.
keep_as_is = ['userid', 'docid', 'id', 'dt']
df = reduce_mem(df, cols=[c for c in df.columns if c not in keep_as_is])

os.makedirs(feat_path, exist_ok=True)
df.to_pickle(f'{feat_path}/feat_global_statis.pkl')

100%|██████████| 4/4 [00:00<00:00, 28.12it/s]
100%|██████████| 2/2 [00:01<00:00,  1.24it/s]
100%|██████████| 22/22 [00:00<00:00, 147.36it/s]


103.32 Mb, 75.22 Mb (27.20 %)


In [14]:
# Plain-text dump of the frame, followed by the row(s) labelled 0.
print(df, df.loc[0], sep='\n')

        userid  docid      timestamp  network  refresh  position  click  \
0       247866  40874  1624869556203        4       19     207.0    0.0   
1        35568  51761  1624838347464        4        4      58.0    1.0   
2        53471  18302  1624656823023        4        4       9.0    0.0   
3       129943  55783  1624865763852        4        1      20.0    0.0   
4       275626  55874  1624930276109        4        1      25.0    0.0   
...        ...    ...            ...      ...      ...       ...    ...   
866706  117928  73437  1625036568698        1        1       NaN    NaN   
866707  140882  68813  1625009469794        1        1       NaN    NaN   
866708  229967  74845  1625027566512        4        1       NaN    NaN   
866709    2399  62844  1625034692217        1       11       NaN    NaN   
866710  148875  64410  1625008637470        1        3       NaN    NaN   

        duration        date       id  device  os  province  city  age  \
0            0.0  2021-06

In [15]:
import warnings
warnings.simplefilter('ignore')
import pickle
import os
import gc
import pandas as pd
# Use the fully-qualified option names: the bare 'max_columns' / 'max_rows'
# patterns also match the 'styler.render.*' options of newer pandas releases
# and are then rejected as ambiguous.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
# from utils import reduce_mem

data_path = '../data'
feat_path = '../data/feat'

load_feat = ['feat_basic', 'feat_basic_history_all', 'feat_global_statis']

# Read every feature file, in the order listed above.
feat_list = [pd.read_pickle(f'{feat_path}/{feat}.pkl') for feat in tqdm(load_feat)]

# Check that each later frame has its rows in the same order as the first
# one; a mismatch is only reported, not corrected.
for i in range(1, len(load_feat)):
    for f in ['userid', 'docid', 'date']:
        if f in feat_list[i] and not (feat_list[0][f] == feat_list[i][f]).all():
            print(f'{load_feat[i]}的{f}顺序不一致')

# Remove from each later frame the columns an earlier frame already provides.
all_columns = feat_list[0].columns.tolist()

for i in range(1, len(load_feat)):
    raw_columns = feat_list[i].columns.tolist()
    drop_columns = list(set(all_columns) & set(raw_columns))
    feat_list[i].drop(columns=drop_columns, inplace=True)

    all_columns += feat_list[i].columns.tolist()

# Place the de-duplicated frames side by side.
df_data = pd.concat(feat_list, axis=1)

del feat_list
gc.collect()

# Categorical columns.
sparse_features = [
    'userid', 'docid', 'network', 'device', 'os', 'province', 'city', 'age',
    'gender', 'category1st', 'category2nd',
]

# Numeric columns: 'refresh' followed by one column per (statistic, key)
# pair, grouped by statistic with the keys always in the same order.
_stat_keys = ['userid', 'docid', 'category1st', 'category2nd',
              'userid_category1st', 'userid_category2nd']
_stat_suffixes = ['history_count', 'ctr', 'history_duration_mean',
                  'history_duration_std', 'count']
dense_features = ['refresh'] + [
    f'{key}_{suffix}' for suffix in _stat_suffixes for key in _stat_keys
]


for col in tqdm(sparse_features):
    lbe = LabelEncoder()
    # Shift the encoder's 0-based codes so that they start at 1. The encoded
    # column contains no NaN, so the former chained fillna(0, inplace=True)
    # could never change anything and has been removed.
    df_data[col] = lbe.fit_transform(df_data[col]) + 1

for col in tqdm(dense_features):
    col_min = df_data[col].min()
    col_range = df_data[col].max() - col_min
    # Min-max scale, then replace missing values by 0. The filled result is
    # assigned back: calling fillna(inplace=True) on df_data[col] acts on a
    # temporary object under pandas Copy-on-Write and would leave the NaNs in
    # the frame.
    df_data[col] = ((df_data[col] - col_min) / col_range).fillna(0)


# df_data = reduce_mem(df_data, cols=[f for f in df_data.columns if f not in ['userid', 'docid', 'id', 'dt']])


# Drop the raw time columns and the helper day index before saving.
df_data.drop(columns=['timestamp', 'dt', 'pubtime', 'day'], inplace=True)

os.makedirs(f'{data_path}', exist_ok=True)
df_data.to_pickle(f'{data_path}/feature.pkl')


# Store the dense feature names next to the feature file.
with open(os.path.join(data_path, 'dense_features.pkl'), 'wb') as f:
    pickle.dump(dense_features, f)

100%|██████████| 3/3 [00:00<00:00,  4.23it/s]
100%|██████████| 11/11 [00:00<00:00, 15.28it/s]
100%|██████████| 31/31 [00:02<00:00, 12.76it/s]


In [16]:
# Plain-text dump of the frame, followed by the row(s) labelled 0.
print(df_data, df_data.loc[0], sep='\n')

        userid  docid  network   refresh  position  click  duration  \
0       247866  40874        4  0.030995     207.0    0.0       0.0   
1        35568  51761        4  0.006525      58.0    1.0     147.0   
2        53471  18302        4  0.006525       9.0    0.0       0.0   
3       129943  55783        4  0.001631      20.0    0.0       0.0   
4       275626  55874        4  0.001631      25.0    0.0       0.0   
...        ...    ...      ...       ...       ...    ...       ...   
866706  117928  73437        1  0.001631       NaN    NaN       NaN   
866707  140882  68813        1  0.001631       NaN    NaN       NaN   
866708  229967  74845        4  0.001631       NaN    NaN       NaN   
866709    2399  62844        1  0.017945       NaN    NaN       NaN   
866710  148875  64410        1  0.004894       NaN    NaN       NaN   

              date       id  device  os  province  city  age  gender  \
0       2021-06-28      NaN    1194   1       103   212    1       3   
1  

In [22]:
# Show the full list of dense feature names, then its first entry.
print(dense_features, dense_features[0], sep='\n')

['refresh', 'userid_history_count', 'docid_history_count', 'category1st_history_count', 'category2nd_history_count', 'userid_category1st_history_count', 'userid_category2nd_history_count', 'userid_ctr', 'docid_ctr', 'category1st_ctr', 'category2nd_ctr', 'userid_category1st_ctr', 'userid_category2nd_ctr', 'userid_history_duration_mean', 'docid_history_duration_mean', 'category1st_history_duration_mean', 'category2nd_history_duration_mean', 'userid_category1st_history_duration_mean', 'userid_category2nd_history_duration_mean', 'userid_history_duration_std', 'docid_history_duration_std', 'category1st_history_duration_std', 'category2nd_history_duration_std', 'userid_category1st_history_duration_std', 'userid_category2nd_history_duration_std', 'userid_count', 'docid_count', 'category1st_count', 'category2nd_count', 'userid_category1st_count', 'userid_category2nd_count']
refresh
