In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import gensim
from tqdm import tqdm
from collections import Counter

In [4]:
# Load the training click log and drop outlier rows: only rows whose
# click_times is below 10 are kept.
df_log = pd.read_csv('train_preliminary/click_log.csv')
df_log = df_log[df_log['click_times'] < 10]

# Load the user labels and shift them down by one so that they start at 0.
# Plain vectorized subtraction replaces the per-row `.apply(lambda x: x-1)`.
df_user = pd.read_csv('train_preliminary/user.csv')
df_user['age'] = df_user['age'] - 1
df_user['gender'] = df_user['gender'] - 1

# Attach the labels to every click row (inner join on user_id, which is the
# pandas default and is now spelled out).
df_train = pd.merge(df_user, df_log, how='inner', on='user_id')
df_train.head()

Unnamed: 0,user_id,age,gender,time,creative_id,click_times
0,1,3,0,43,71691,1
1,1,3,0,20,821396,1
2,1,3,0,20,209778,1
3,1,3,0,20,877468,1
4,1,3,0,64,3068256,1


In [5]:
# Enrich every training click with the attributes of its ad. The left join
# keeps clicks whose creative_id has no matching row in ad.csv.
# NOTE: this cell reassigns df_train from itself, so running it twice joins
# the ad columns a second time.
df_ad = pd.read_csv('train_preliminary/ad.csv')
df_train = df_train.merge(df_ad, on='creative_id', how='left')
df_train.head()

Unnamed: 0,user_id,age,gender,time,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,1,3,0,43,71691,1,66210,\N,18,14681,326
1,1,3,0,20,821396,1,724607,\N,5,7293,326
2,1,3,0,20,209778,1,188507,136,2,9702,6
3,1,3,0,20,877468,1,773445,\N,5,29455,106
4,1,3,0,64,3068256,1,2638858,1454,2,23575,238


In [6]:
# Test data: read the ad metadata and the click log, keep only click rows
# with click_times below 10, then left-join the ad attributes onto each click.
test_ad = pd.read_csv('test/ad.csv')
test_log = pd.read_csv('test/click_log.csv')
test_log = test_log.loc[test_log['click_times'] < 10]
df_test = test_log.merge(test_ad, on='creative_id', how='left')
df_test.head()

Unnamed: 0,time,user_id,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,20,3131989,645764,1,573314,58,2,14689,6
1,20,3131989,1027422,1,902764,129,2,42272,6
2,20,3131989,1106443,1,970829,2171,2,37513,322
3,20,3131989,629802,1,559183,\N,18,14678,26
4,59,3131989,2839769,1,2441288,129,2,35328,6


In [7]:
# Count how often each ad_id occurs across the training and test click rows
# combined.
counter = Counter(df_train['ad_id'].values)
counter.update(df_test['ad_id'].values)

In [8]:
# Vocabulary preparation: inspect the distribution of per-ad_id occurrence
# counts to choose a cutoff for dropping low-frequency ids.
ad_id_count = np.array(list(counter.values()))
quantile_levels = [0.01, 0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9]
np.quantile(ad_id_count, quantile_levels)

array([ 1.,  1.,  1.,  1.,  2.,  3.,  5.,  9., 21.])

In [9]:
# Number of distinct ad_id values that occur at least 20 times
# (the counts come from ad_id, not creative_id).
np.sum(ad_id_count >= 20)

322514

In [10]:
# Build the vocabulary. Indices 0 and 1 are reserved (0 = pad, 1 = unk), so
# every ad_id seen at least 20 times is numbered from 2 upwards, in the
# iteration order of `counter`.
words = [ad_id for ad_id, freq in counter.items() if freq >= 20]
vocab = {ad_id: index for index, ad_id in enumerate(words, start=2)}
len(vocab)

322514

In [11]:
# Drop click rows whose ad_id is not in the vocabulary (low-frequency ids),
# printing the frame shapes before and after for comparison.
print(df_train.shape, df_test.shape)
df_train = df_train.loc[lambda frame: frame['ad_id'].isin(vocab)]
df_test = df_test.loc[lambda frame: frame['ad_id'].isin(vocab)]
print(df_train.shape, df_test.shape)

(30081881, 11) (33582799, 9)
(25402681, 11) (28331365, 9)


In [12]:
# Preview the first five training rows that remain after the vocabulary filter.
df_train.head(5)

Unnamed: 0,user_id,age,gender,time,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,1,3,0,43,71691,1,66210,\N,18,14681,326
2,1,3,0,20,209778,1,188507,136,2,9702,6
3,1,3,0,20,877468,1,773445,\N,5,29455,106
4,1,3,0,64,3068256,1,2638858,1454,2,23575,238
5,1,3,0,39,1683713,1,1458878,\N,5,14668,326


In [33]:
# # Sort clicks by user and time
# df_train.sort_values(by=['user_id', 'time'], inplace=True)
# df_test.sort_values(by=['user_id', 'time'], inplace=True)

In [38]:
# Build one line per training user for saving: uid, age, gender, then the vocab ids of the clicked ads
# train_group = df_train.groupby('user_id')
# train_datas = []
# for uid, df_user in tqdm(train_group):
#     age = df_user.age.values[0]
#     gender = df_user.gender.values[0]
#     ads = [vocab[k] for k in df_user.ad_id.values]
#     line = ' '.join(map(str, [uid, age, gender] + ads))
#     train_datas.append(line)

100%|██████████| 899996/899996 [07:47<00:00, 1924.47it/s]


In [39]:
# test_datas
# test_group = df_test.groupby('user_id')
# test_datas = []
# for uid, df_user in tqdm(test_group):
#     ads = [vocab[k] for k in df_user.ad_id.values]
#     line = str(uid) + ' ' + ' '.join(map(str, ads))
#     test_datas.append(line)

100%|██████████| 999996/999996 [06:48<00:00, 2448.08it/s]


In [40]:
# def save(path, datas):
#     with open(path, 'w') as f:
#         for line in tqdm(datas):
#             f.write(line + '\n')

In [41]:
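# Train/validation split: shuffle the users with a fixed seed, then write 80% to ad_train.csv and 20% to ad_val.csv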
# import random
# random.seed(1)
# random.shuffle(train_datas)
# all_users = len(train_datas)
# train_size = int(all_users * 0.8)
# save('./datas/ad_train.csv', train_datas[:train_size])
# save('./datas/ad_val.csv', train_datas[train_size: ])

100%|██████████| 719996/719996 [00:01<00:00, 487774.88it/s]
100%|██████████| 180000/180000 [00:00<00:00, 501635.00it/s]


In [44]:
# train_datas[:3]

['293519 5 1 104566 7261 220169 71919 57331 308980 3040 40666 40666 25469 56907 4484 4705 1493 26421 94125 12883 1718 7738 105429 6681 36208 1160 21027 268944 125601 101723 2841 31687 111487 43311 2827 100821',
 '826739 2 1 142039 209971 154246 29933 125512 9122 36668 72190 145309',
 '546131 1 1 10625 309863 128790 227920 17828 7202 1894 82111 187108 24808 56208 2078 484 1400 7346 159667 59915 188091 130333 27314 31206 86693 172197 26786 35983 34441']

In [45]:
# Joint word2vec corpus built from both the training and the test ad sequences
# sentences = [line.split(' ')[3:] for line in train_datas] + [line.split(' ')[1:] for line in test_datas]
# random.shuffle(sentences)

In [46]:
# NOTE(review): `workers=-1` is presumably meant as "use all cores"; gensim may
# not interpret it that way (it could start no worker threads and leave the
# vectors untrained) — TODO confirm and pass an explicit positive worker count.
# NOTE(review): `size` / `iter` look like the gensim 3.x argument names
# (`vector_size` / `epochs` in 4.x) — verify against the installed version.
# model = gensim.models.Word2Vec(sentences, min_count=1, size=256, workers=-1, iter=100)

In [47]:
# model.save('./word2vec/ad.model')

In [49]:
# Number of test users that still have at least one in-vocabulary click.
# `test_datas` holds one line per user group of df_test, but it is only built
# by a cell that is currently commented out, so `len(test_datas)` raises
# NameError on a fresh kernel. The distinct user count of df_test is the same
# number and needs no hidden state.
df_test['user_id'].nunique()

999996

In [50]:
# save('./datas/ad_test.csv', test_datas)

100%|██████████| 999996/999996 [00:01<00:00, 574238.79it/s]


In [13]:
# Test users present in the click log but absent from df_test, i.e. users
# with no rows left in the filtered frame.
# gender 1 , age 3  -- NOTE(review): presumably the fallback labels to assign
# to these users; confirm.
users_in_log = set(test_log['user_id'].unique())
users_in_filtered = df_test['user_id'].unique()
users_in_log.difference(users_in_filtered)

{3054437, 3086425, 3191252, 3845254}

In [None]:
# length = np.array([len(x) for x in ])