In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
import gensim

### 去除出现频次低的id类特征

In [2]:
# Load the training click log and preview its first rows.
df_log = pd.read_csv('train_preliminary/click_log.csv')
df_log.head()

Unnamed: 0,time,user_id,creative_id,click_times
0,9,30920,567330,1
1,65,30920,3072255,1
2,56,30920,2361327,1
3,6,309204,325532,1
4,59,309204,2746730,1


In [3]:
# Frequency of each distinct click_times value in the log.
df_log.click_times.value_counts()

1      28426543
2       1507449
3         90356
4         43047
5          7507
6          3512
7          1529
8          1486
9           452
10          270
11          172
12          116
13           76
14           62
15           36
16           34
17           22
18           21
21           17
20           13
19           10
22            5
23            4
26            3
32            3
27            3
30            3
24            3
25            2
29            2
33            2
37            2
28            1
67            1
34            1
35            1
36            1
43            1
44            1
50            1
152           1
Name: click_times, dtype: int64

In [4]:
# Drop anomalous records: keep only rows whose click_times is below 10.
is_plausible = df_log['click_times'] < 10
df_log = df_log.loc[is_plausible]

In [5]:
# Row/column count of the click log after the click_times filter.
df_log.shape

(30081881, 4)

In [104]:
# Load the user table and the ad table of the training set.
df_user = pd.read_csv('train_preliminary/user.csv')
df_ad = pd.read_csv('train_preliminary/ad.csv')

In [106]:
# Missing values are mapped to a sentinel id (100000 by default).
def process_nan(x, fill_value=100000):
    """Convert `x` to int, substituting `fill_value` when it is not convertible.

    Args:
        x: Raw cell value (e.g. an int, a numeric string, or a missing-value marker).
        fill_value: Integer returned when `int(x)` fails. Defaults to 100000.

    Returns:
        `int(x)` on success, otherwise `fill_value`.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "missing"; unrelated
        # exceptions such as KeyboardInterrupt are no longer swallowed.
        return fill_value

In [107]:
# Shift age and gender down by one.
for label_col in ('age', 'gender'):
    df_user[label_col] = df_user[label_col] - 1

# Replace non-integer product_id / industry entries via process_nan.
for ad_col in ('product_id', 'industry'):
    df_ad[ad_col] = df_ad[ad_col].map(process_nan)

In [108]:
# Attach each user's click records, then left-join the ad attributes by creative_id.
user_clicks = df_user.merge(df_log, on='user_id')
df = user_clicks.merge(df_ad, how='left', on='creative_id')
df.head()

Unnamed: 0,user_id,age,gender,time,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,1,3,0,43,71691,1,66210,100000,18,14681,326
1,1,3,0,20,821396,1,724607,100000,5,7293,326
2,1,3,0,20,209778,1,188507,136,2,9702,6
3,1,3,0,20,877468,1,773445,100000,5,29455,106
4,1,3,0,64,3068256,1,2638858,1454,2,23575,238


In [109]:
## Try using only the creative_id feature: count how often each creative_id occurs.
creative_id_vc = df.creative_id.value_counts()

In [110]:
# Selected quantiles of the per-creative_id occurrence counts.
creative_id_vc.quantile([0.1, 0.2, 0.3, 0.4, 0.6, 0.8])

0.1    1.0
0.2    1.0
0.3    1.0
0.4    1.0
0.6    2.0
0.8    6.0
Name: creative_id, dtype: float64

In [111]:
# Number of distinct users in the merged training frame.
df.user_id.nunique()

900000

In [112]:
# Keep only creative_ids that occur more than 10 times (counts <= 10 are filtered out),
# then count how many distinct users still have at least one remaining click.
is_frequent = creative_id_vc > 10
creative_id_list = creative_id_vc.loc[is_frequent].index
df.loc[df.creative_id.isin(creative_id_list), 'user_id'].nunique()

899993

In [23]:
# Check how many test-set users this filtering strategy would drop.
test_log = pd.read_csv('test/click_log.csv')
test_ad = pd.read_csv('test/ad.csv')
test_df = test_log.merge(test_ad, how='left', on='creative_id')
test_df.head()

Unnamed: 0,time,user_id,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,20,3131989,645764,1,573314,58,2,14689,6
1,20,3131989,1027422,1,902764,129,2,42272,6
2,20,3131989,1106443,1,970829,2171,2,37513,322
3,20,3131989,629802,1,559183,\N,18,14678,26
4,59,3131989,2839769,1,2441288,129,2,35328,6


In [26]:
# Distinct test users before vs. after keeping only the frequent creative_ids.
test_df.user_id.nunique(), test_df[test_df.creative_id.isin(creative_id_list)]['user_id'].nunique()

(1000000, 999995)

In [113]:
# Restrict the training frame to rows whose creative_id is in the frequent list.
keep_rows = df['creative_id'].isin(creative_id_list)
df = df.loc[keep_rows]
df.head()

Unnamed: 0,user_id,age,gender,time,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,1,3,0,43,71691,1,66210,100000,18,14681,326
2,1,3,0,20,209778,1,188507,136,2,9702,6
3,1,3,0,20,877468,1,773445,100000,5,29455,106
4,1,3,0,64,3068256,1,2638858,1454,2,23575,238
5,1,3,0,39,1683713,1,1458878,100000,5,14668,326


In [114]:
# Build one remapping dictionary per id column; indices start at 1 because
# 0 is reserved for padding.
columns = ['creative_id', 'product_id', 'product_category', 'advertiser_id', 'industry']
vocab = {}
for col in columns:
    distinct_ids = df[col].unique()
    vocab[col] = {raw_id: idx for idx, raw_id in enumerate(distinct_ids, start=1)}

In [115]:
# Spot check: the remapped index assigned to raw product_category 15.
vocab['product_category'][15]

13

In [116]:
# import json
# vocab_str = {}
# for col in vocab.keys():
#     vocab_str[col] = {str(k): v for k, v in vocab[col].items()}
# with open('vocab.txt', 'w') as f:
#     f.write(json.dumps(vocab_str))

In [117]:
# Replace raw ids with their vocabulary indices.
for col in columns:
    # Series.map with a dict is vectorized, unlike a per-row Python lambda.
    encoded = df[col].map(vocab[col])
    # map() yields NaN for keys absent from the dict; fail loudly instead of
    # silently keeping NaNs (the per-row lookup used to raise KeyError here).
    if encoded.isna().any():
        raise KeyError(f"column {col!r} contains values missing from vocab")
    df[col] = encoded
df.head()

Unnamed: 0,user_id,age,gender,time,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,1,3,0,43,1,1,66210,1,1,1,1
2,1,3,0,20,2,1,188507,2,2,2,2
3,1,3,0,20,3,1,773445,1,3,3,3
4,1,3,0,64,4,1,2638858,3,2,4,4
5,1,3,0,39,5,1,1458878,1,3,5,1


In [118]:
# Ignore the ad id for now; click_times is dropped as well.
unused_cols = ['ad_id', 'click_times']
df = df.drop(columns=unused_cols)
df.head()

Unnamed: 0,user_id,age,gender,time,creative_id,product_id,product_category,advertiser_id,industry
0,1,3,0,43,1,1,1,1,1
2,1,3,0,20,2,2,2,2,2
3,1,3,0,20,3,1,3,3,3
4,1,3,0,64,4,3,2,4,4
5,1,3,0,39,5,1,3,5,1


In [119]:
# word2vec preparation: order the rows by user and then by time.
sort_keys = ['user_id', 'time']
df = df.sort_values(by=sort_keys)
df.head()

Unnamed: 0,user_id,age,gender,time,creative_id,product_id,product_category,advertiser_id,industry
2,1,3,0,20,2,2,2,2,2
3,1,3,0,20,3,1,3,3,3
5,1,3,0,39,5,1,3,5,1
12,1,3,0,40,11,6,2,11,9
0,1,3,0,43,1,1,1,1,1


In [120]:
# Preview the serialized line format on the first few rows.
# The loop variable is deliberately not named `df_user`: at cell level that
# would overwrite the user table loaded from user.csv and break re-running
# the earlier cells that use it.
for uid, user_rows in tqdm(df.head().groupby('user_id')):
    values = user_rows.values
    print(values)
    # Header is "uid_age_gender" (columns 1 and 2 of the first row), followed
    # by one tab-separated token per row joining columns 4 onward with "_".
    header = '_'.join([str(uid), str(values[0, 1]), str(values[0, 2])])
    line = header + '\t' + '\t'.join(['_'.join(map(str, row[4:])) for row in values])
    print(line)

100%|██████████| 1/1 [00:00<00:00, 184.50it/s]

[[ 1  3  0 20  2  2  2  2  2]
 [ 1  3  0 20  3  1  3  3  3]
 [ 1  3  0 39  5  1  3  5  1]
 [ 1  3  0 40 11  6  2 11  9]
 [ 1  3  0 43  1  1  1  1  1]]
1_3_0	2_2_2_2_2	3_1_3_3_3	5_1_3_5_1	11_6_2_11_9	1_1_1_1_1





In [84]:
def flat(group):
    """Serialize every group of rows into one tab-separated text line.

    `group` is iterated as (uid, frame) pairs. Each produced line starts with
    "uid_<col 1>_<col 2>" taken from the group's first row, followed by one
    token per row made of columns 4 onward joined with "_". Returns the list
    of lines in iteration order.
    """
    serialized = []
    for uid, rows in tqdm(group):
        matrix = rows.values
        first_row = matrix[0]
        head = '_'.join(map(str, (uid, first_row[1], first_row[2])))
        clicks = '\t'.join('_'.join(str(cell) for cell in row[4:]) for row in matrix)
        serialized.append(head + '\t' + clicks)
    return serialized

In [121]:
# Group the sorted click rows per user and serialize each group into one line.
df_group = df.groupby('user_id')
datas = flat(df_group)

100%|██████████| 899993/899993 [06:49<00:00, 2199.12it/s]


In [122]:
def save(path, datas):
    """Write each string in `datas` to the file at `path`, one per line."""
    with open(path, 'w') as out:
        out.writelines(record + '\n' for record in tqdm(datas))

In [123]:
# Train / validation / test split: shuffle with a fixed seed, then cut at 70% and 85%.
import random
random.seed(1)
random.shuffle(datas)
all_users = len(datas)
train_size = int(all_users * 0.7)
val_size = int(all_users * 0.85)  # end index of the validation slice
split_bounds = [
    ('./datas/train.csv', 0, train_size),
    ('./datas/val.csv', train_size, val_size),
    ('./datas/test.csv', val_size, all_users),
]
for split_path, start, stop in split_bounds:
    save(split_path, datas[start:stop])

100%|██████████| 629995/629995 [00:02<00:00, 244604.05it/s]
100%|██████████| 134999/134999 [00:00<00:00, 214305.99it/s]
100%|██████████| 134999/134999 [00:00<00:00, 204352.87it/s]


In [124]:
# Parse the serialized lines back into one integer sequence per user and feature.
creative_id, product_id, product_category, advertiser_id, industry = [], [], [], [], []
feature_buckets = (creative_id, product_id, product_category, advertiser_id, industry)
for record in tqdm(datas):
    # Field 0 is the "uid_age_gender" header; the remaining fields are click tokens.
    tokens = record.split('\t')[1:]
    matrix = np.array([[int(part) for part in token.split('_')] for token in tokens])
    for position, bucket in enumerate(feature_buckets):
        bucket.append(matrix[:, position])

100%|██████████| 899993/899993 [01:16<00:00, 11744.84it/s]


In [125]:
# Sanity check: every feature should have the same number of per-user sequences.
len(creative_id), len(product_id), len(product_category), len(advertiser_id), len(industry)

(899993, 899993, 899993, 899993, 899993)

In [126]:
def _to_tokens(sequences):
    """Convert each sequence of ids into a list of string tokens."""
    return [[str(value) for value in seq] for seq in sequences]

# String-token versions of every feature's sequences.
creative_id_str = _to_tokens(creative_id)
product_id_str = _to_tokens(product_id)
product_category_str = _to_tokens(product_category)
advertiser_id_str = _to_tokens(advertiser_id)
industry_str = _to_tokens(industry)

In [127]:
# Train one Word2Vec model per id feature.
# NOTE: gensim does not treat workers=-1 as "use all cores": a negative value
# starts zero worker threads, so no training happens and the vectors stay at
# their random initialization. Pass an explicit positive thread count instead.
# (`size` / `iter` are the gensim 3.x parameter names; gensim 4 renamed them
# to `vector_size` / `epochs`.)
import os

n_workers = os.cpu_count() or 1

model_1 = gensim.models.Word2Vec(creative_id_str, min_count=1, size=256, workers=n_workers, iter=100)
model_2 = gensim.models.Word2Vec(product_id_str, min_count=1, size=256, workers=n_workers, iter=50)
model_3 = gensim.models.Word2Vec(product_category_str, min_count=1, size=8, workers=n_workers, iter=50)
model_4 = gensim.models.Word2Vec(advertiser_id_str, min_count=1, size=256, workers=n_workers, iter=50)
model_5 = gensim.models.Word2Vec(industry_str, min_count=1, size=32, workers=n_workers, iter=50)

In [128]:
# Persist each trained model under ./word2vec/.
model_outputs = [
    (model_1, './word2vec/new_creative_id.model'),
    (model_2, './word2vec/new_product_id.model'),
    (model_3, './word2vec/new_category.model'),
    (model_4, './word2vec/new_advertiser_id.model'),
    (model_5, './word2vec/new_industry.model'),
]
for w2v_model, model_path in model_outputs:
    w2v_model.save(model_path)

In [130]:
# Print the vocabulary size of each remapped column.
for col in columns:
    mapping = vocab[col]
    print(col, len(mapping))

creative_id 292981
product_id 6565
product_category 17
advertiser_id 16434
industry 284


In [131]:
# Look up every remapped creative_id in the trained model; the result is
# discarded, the lookup itself is the point of the loop.
for mapped_id in vocab['creative_id'].values():
    model_1.wv[str(mapped_id)]

In [133]:
# Number of clicks in each user's creative_id sequence.
length = np.array([len(x) for x in creative_id])

In [134]:
# Upper quantiles of the sequence lengths, used to choose a padding size.
np.quantile(length, [0.9, 0.95, 0.98, 0.99, 0.999])

array([ 54.   ,  73.   , 101.   , 124.   , 222.008])

In [138]:
# Reload the test tables and apply the same cleaning as for the training data.
test_log = pd.read_csv('test/click_log.csv')
test_ad = pd.read_csv('test/ad.csv')
# Drop anomalous records: keep only rows whose click_times is below 10.
test_log = test_log.loc[test_log['click_times'] < 10]
# Replace non-integer product_id / industry entries via process_nan.
for ad_col in ('product_id', 'industry'):
    test_ad[ad_col] = test_ad[ad_col].map(process_nan)

In [144]:
# Left-join the ad attributes onto the test click log by creative_id.
test_df = test_log.merge(test_ad, how='left', on='creative_id')
test_df.head()

Unnamed: 0,time,user_id,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,20,3131989,645764,1,573314,58,2,14689,6
1,20,3131989,1027422,1,902764,129,2,42272,6
2,20,3131989,1106443,1,970829,2171,2,37513,322
3,20,3131989,629802,1,559183,100000,18,14678,26
4,59,3131989,2839769,1,2441288,129,2,35328,6


In [145]:
# Keep only test rows whose creative_id is in the frequent list.
keep_test_rows = test_df['creative_id'].isin(creative_id_list)
test_df = test_df.loc[keep_test_rows]
test_df.head()

Unnamed: 0,time,user_id,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,20,3131989,645764,1,573314,58,2,14689,6
1,20,3131989,1027422,1,902764,129,2,42272,6
3,20,3131989,629802,1,559183,100000,18,14678,26
5,16,3131989,564033,1,501478,129,2,38382,6
6,17,3131989,103304,1,93874,100000,18,14,23


In [142]:
# Row/column count of the test frame.
test_df.shape

(27474658, 9)

In [146]:
# Replace raw ids in the test frame with their vocabulary indices.
for col in columns:
    # Series.map with a dict is vectorized, unlike a per-row Python lambda.
    encoded = test_df[col].map(vocab[col])
    # map() yields NaN for ids that are not in the vocabulary; fail loudly
    # instead of silently writing NaNs (the per-row lookup used to raise
    # KeyError here).
    if encoded.isna().any():
        raise KeyError(f"column {col!r} contains values missing from vocab")
    test_df[col] = encoded
test_df.head()

Unnamed: 0,time,user_id,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,20,3131989,169943,1,573314,42,2,301,2
1,20,3131989,180463,1,902764,11,2,18,2
3,20,3131989,4677,1,559183,1,1,1842,50
5,16,3131989,56882,1,501478,11,2,2546,2
6,17,3131989,11917,1,93874,1,1,682,111


In [147]:
# Ignore the ad id for now (click_times is dropped as well), then order the
# rows by user and time.
test_df = test_df.drop(columns=['ad_id', 'click_times'])
test_df = test_df.sort_values(by=['user_id', 'time'])
test_df.head()

Unnamed: 0,time,user_id,creative_id,product_id,product_category,advertiser_id,industry
31370198,3,3000001,213696,1,1,1756,17
31370205,11,3000001,5546,340,2,63,24
31370206,11,3000001,49685,246,2,658,20
31370203,23,3000001,30888,1967,9,6019,20
31370199,29,3000001,11347,1,1,840,17


In [149]:
# Serialize every test user's clicks into ./datas/online.csv, one line per user:
# "<uid>\t<token>\t<token>...", each token joining columns 2 onward with "_".
# The loop variable is deliberately not named `df_user`: at cell level that
# would overwrite the user table loaded from user.csv.
test_group = test_df.groupby('user_id')
with open('./datas/online.csv', 'w') as f:
    for uid, user_rows in tqdm(test_group):
        values = user_rows.values
        tokens = ['_'.join(map(str, row[2:])) for row in values]
        f.write(str(uid) + '\t' + '\t'.join(tokens) + '\n')

100%|██████████| 999995/999995 [07:36<00:00, 2191.08it/s]


In [150]:
# Users present in the test log but absent from the filtered test frame.
set(test_log.user_id.unique()).difference(test_df.user_id.unique())

{3054437, 3086425, 3845254, 3912755, 3976473}

In [152]:
# Number of click rows per gender label in the training frame.
df.gender.value_counts()

0    16981615
1     7911364
Name: gender, dtype: int64

In [153]:
# Number of click rows per age label in the training frame.
df.age.value_counts()

2    5500271
1    4294468
3    4089199
4    3558916
5    2757860
6    1805929
0    1127763
7     861248
8     542956
9     354369
Name: age, dtype: int64