In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import gensim
from tqdm import tqdm

In [3]:
datas = []
with open('train_group.csv', 'r') as f:
    for line in tqdm(f):
        seqences = line.split('\t')[1:]
        datas.append(seqences)

630000it [00:13, 45665.37it/s]


In [51]:
creative_id, ad_id, product_id, advertiser_id = [], [], [], []
age, gender = [], []
time, click_times = [], []
product_category, industry = [], []

In [53]:
for data in tqdm(datas):
    arr = np.array([list(map(int, seq.split('_'))) for seq in data])
    age.append(arr[0, 0])
    gender.append(arr[0, 1])
    creative_id.append(arr[:, 3])
    click_times.append(arr[:, 4])
    ad_id.append(arr[:, 5])
    product_id.append(arr[:, 6])
    product_category.append(arr[:, 7])
    advertiser_id.append(arr[:, 8])
    industry.append(arr[:, 9])

100%|██████████| 630000/630000 [01:30<00:00, 6996.77it/s]


In [55]:
len(age), len(gender), len(creative_id), len(ad_id), len(product_id), len(product_category), len(industry)

(630000, 630000, 630000, 630000, 630000, 630000, 630000)

In [56]:
creative_id[:3]

[array([  63441,  155822,   39714,  609050,   13069,  441462, 1266180,
        1657530, 1696925,  769749, 1074235, 1662244, 1252062, 1145367,
          96192, 1145367, 2085566, 1696925, 2085566,  396652,  157180,
        2369339, 2143574, 2489057,  647980, 2552139, 2496722, 2624965,
        2785305, 2108551, 2862407,  738378, 3246898, 3443654, 2995416,
        3765766, 1416518, 3766271, 3574990, 2270719,   22013,   15558,
         513298, 4194247,  625944]),
 array([ 661347,  808612,  710859,  825434,  593522,  726940,  392052,
        1173863,  862241, 1725184,   72533, 1014211, 2198902, 2073029,
        1899318,   66009, 2081930, 2502798, 3386299, 3403177, 3255755,
         599128, 3696597, 3684862, 3560473, 3491821, 2816845, 2032927,
        3581615, 2556703]),
 array([  39588,  589886,  574787, 1892854, 1962706, 2264105, 1230094,
          31070, 2348342, 2189510, 2728508, 2697265,  765502,   72989,
        2800740, 2482695, 1457862, 3029703, 2907196, 1150678, 2741116,
        2928

In [57]:
creative_id_str = [list(map(str, x)) for x in creative_id]
ad_id_str = [list(map(str, x)) for x in ad_id]
product_id_str = [list(map(str, x)) for x in product_id]
advertiser_id_str = [list(map(str, x)) for x in advertiser_id]

In [58]:
model_1 = gensim.models.Word2Vec(creative_id_str, min_count=1, size=256, workers=-1, iter=20)
model_2 = gensim.models.Word2Vec(ad_id_str, min_count=1, size=256, workers=-1, iter=20)
model_3 = gensim.models.Word2Vec(product_id_str, min_count=1, size=256, workers=-1, iter=20)
model_4 = gensim.models.Word2Vec(advertiser_id_str, min_count=1, size=256, workers=-1, iter=20)

In [59]:
model_1.save('./word2vec/creative_id.model')
model_2.save('./word2vec/ad_id.model')
model_3.save('./word2vec/product_id.model')
model_4.save('./word2vec/advertiser_id.model')

In [66]:
product_category_str = [list(map(str, x)) for x in product_category]
industry_str = [list(map(str, x)) for x in industry]
model_5 = gensim.models.Word2Vec(product_category_str, min_count=1, size=16, workers=-1, iter=20)
model_6 = gensim.models.Word2Vec(industry_str, min_count=1, size=32, workers=-1, iter=20)
model_5.save('./word2vec/product_category.model')
model_6.save('./word2vec/industry.model')

In [11]:
# embedding 不存在用随机初始化替代
def get_embedding(w, model, embedding_dim):
    w = str(w)
    try:
        v = model.wv[w]
    except Exception as e:
        v = np.random.normal(loc=0.0, scale=1e-3, size=embedding_dim)
    return v

In [13]:
def sum_pooling(idss, model, embedding_dim):
    arr = []
    for ids in idss:
        one_user = np.array([get_embedding(w, model, embedding_dim) for w in ids])
        arr.append(one_user.sum(axis=0))
    return np.array(arr)

In [62]:
f_creative_id = sum_pooling(creative_id, model_1)
f_ad_id = sum_pooling(ad_id, model_2)
f_product_id = sum_pooling(product_id, model_3)
f_advertiser_id = sum_pooling(advertiser_id, model_4)

In [67]:
f_product_category = sum_pooling(product_category, model_5)
f_industry = sum_pooling(industry, model_6)

In [63]:
# 添加统计特征
click_nums, n_creative_id, n_ad_id, n_product_id, n_advertiser_id, n_product_category, n_industry = [], [], [], [], [], [], []
for data in tqdm(datas):
    arr = np.array([list(map(int, seq.split('_'))) for seq in data])
    click_nums.append(sum(arr[:, 4]))
    n_creative_id.append(len(set(arr[:, 3])))
    n_ad_id.append(len(set(arr[:, 5])))
    n_product_id.append(len(set(arr[:, 6])))
    n_advertiser_id.append(len(set(arr[:, 8])))
    n_product_category.append(len(set(arr[:, 7])))
    n_industry.append(len(set(arr[:, 9])))

100%|██████████| 630000/630000 [02:03<00:00, 5084.82it/s]


In [64]:
len(click_nums), len(n_creative_id), len(n_ad_id), len(n_product_id), len(n_advertiser_id), len(n_product_category)

(630000, 630000, 630000, 630000, 630000, 630000)

In [68]:
age = np.array(age, dtype=int)
gender = np.array(gender, dtype=int)
# product_category = np.array(product_category, dtype=int)
# industry = np.array(industry, dtype=int)
click_nums = np.array(click_nums, dtype=int)
n_creative_id = np.array(n_creative_id, dtype=int)
n_ad_id = np.array(n_ad_id, dtype=int)
n_product_id = np.array(n_product_id, dtype=int)
n_advertiser_id = np.array(n_advertiser_id, dtype=int)
n_product_category = np.array(n_product_category, dtype=int)
n_industry = np.array(n_industry, dtype=int)

In [69]:
f_feature_group_1 = [age[:, None], gender[:, None], click_nums[:, None], n_creative_id[:, None], n_ad_id[:, None], n_product_id[:, None], n_advertiser_id[:, None],n_product_category[:, None], n_industry[:, None]]
f_feature_group_1 = np.concatenate(f_feature_group_1, axis=1)
columns = ['age', 'gender', 'click_nums', 'n_creative_id', 'n_ad_id', 'n_product_id', 'n_advertiser_id', 'n_product_category', 'n_industry']
df1 = pd.DataFrame(f_feature_group_1, columns=columns, dtype=int)
df1.head()

Unnamed: 0,age,gender,click_nums,n_creative_id,n_ad_id,n_product_id,n_advertiser_id,n_product_category,n_industry
0,9,0,46,42,42,20,36,3,15
1,6,1,30,30,30,17,28,6,8
2,4,0,29,29,29,18,26,6,10
3,3,0,34,33,33,7,30,4,18
4,5,0,17,16,16,6,13,2,10


In [70]:
df1.shape

(630000, 9)

In [77]:
f_feature_group_2 = [f_creative_id, f_ad_id, f_product_id, f_advertiser_id, f_product_category, f_industry]
f_feature_group_2 = np.concatenate(f_feature_group_2, axis=1)
columns = ['f_creative_id_{}'.format(i) for i in range(256)] + ['f_ad_id_{}'.format(i) for i in range(256)] + \
    ['f_product_id_{}'.format(i) for i in range(256)] + ['f_advertiser_id_{}'.format(i) for i in range(256)] + \
    ['f_product_category_{}'.format(i) for i in range(16)] + ['f_industry_{}'.format(i) for i in range(32)]
df2 = pd.DataFrame(f_feature_group_2, columns=columns)
df2.head()

Unnamed: 0,f_creative_id_0,f_creative_id_1,f_creative_id_2,f_creative_id_3,f_creative_id_4,f_creative_id_5,f_creative_id_6,f_creative_id_7,f_creative_id_8,f_creative_id_9,...,f_industry_22,f_industry_23,f_industry_24,f_industry_25,f_industry_26,f_industry_27,f_industry_28,f_industry_29,f_industry_30,f_industry_31
0,0.008487,-0.010115,-0.004467,0.022516,-0.001537,0.002105,0.001844,-0.014507,-0.000807,0.009577,...,-0.283167,-0.139427,-0.208735,-0.07339,0.166172,-0.098235,-0.018001,-0.128303,0.011469,0.029739
1,0.008989,-0.005605,0.004263,0.005424,-0.008389,0.005149,0.012832,-0.000406,0.003622,-0.000654,...,0.015187,-0.170662,-0.162514,-0.13776,0.214335,-0.10789,0.23572,0.060866,0.024728,0.132004
2,-0.008624,-0.001089,0.007464,0.000332,0.001043,0.004396,-0.008182,0.000615,0.000254,-0.006041,...,-0.104666,-0.020134,-0.206185,-0.09215,0.163732,0.007811,0.064205,-0.148316,-0.006584,0.047852
3,0.004517,-0.003767,-0.006177,-0.005585,0.005178,-0.001185,0.005227,-0.002574,0.007146,0.002559,...,-0.061266,-0.017802,-0.171342,-0.169673,0.153425,-0.149567,0.00946,0.002938,-0.028249,0.090356
4,-0.003735,-0.005992,-0.000774,-0.003099,-0.00192,0.006253,0.005343,-0.002564,-0.00522,-0.008895,...,-0.084007,0.030379,-0.044082,-0.032765,0.038733,-0.049661,0.00854,-0.003604,0.020901,0.023973


In [78]:
df2.shape

(630000, 1072)

In [79]:
train_df = pd.concat([df1, df2], axis=1)
train_df.shape

(630000, 1081)

In [80]:
train_df.head()

Unnamed: 0,age,gender,click_nums,n_creative_id,n_ad_id,n_product_id,n_advertiser_id,n_product_category,n_industry,f_creative_id_0,...,f_industry_22,f_industry_23,f_industry_24,f_industry_25,f_industry_26,f_industry_27,f_industry_28,f_industry_29,f_industry_30,f_industry_31
0,9,0,46,42,42,20,36,3,15,0.008487,...,-0.283167,-0.139427,-0.208735,-0.07339,0.166172,-0.098235,-0.018001,-0.128303,0.011469,0.029739
1,6,1,30,30,30,17,28,6,8,0.008989,...,0.015187,-0.170662,-0.162514,-0.13776,0.214335,-0.10789,0.23572,0.060866,0.024728,0.132004
2,4,0,29,29,29,18,26,6,10,-0.008624,...,-0.104666,-0.020134,-0.206185,-0.09215,0.163732,0.007811,0.064205,-0.148316,-0.006584,0.047852
3,3,0,34,33,33,7,30,4,18,0.004517,...,-0.061266,-0.017802,-0.171342,-0.169673,0.153425,-0.149567,0.00946,0.002938,-0.028249,0.090356
4,5,0,17,16,16,6,13,2,10,-0.003735,...,-0.084007,0.030379,-0.044082,-0.032765,0.038733,-0.049661,0.00854,-0.003604,0.020901,0.023973


In [81]:
train_df.to_pickle('./word2vec/train.pkl')

In [2]:
model_1 = gensim.models.Word2Vec.load('./word2vec/creative_id.model')
model_2 = gensim.models.Word2Vec.load('./word2vec/ad_id.model')
model_3 = gensim.models.Word2Vec.load('./word2vec/product_id.model')
model_4 = gensim.models.Word2Vec.load('./word2vec/advertiser_id.model')
model_5 = gensim.models.Word2Vec.load('./word2vec/product_category.model')
model_6 = gensim.models.Word2Vec.load('./word2vec/industry.model')

In [14]:
def process(in_path, out_path):
    creative_id, ad_id, product_id, advertiser_id = [], [], [], []
    age, gender = [], []
#     time, click_times = [], []
    product_category, industry = [], []
    # 添加统计特征
    click_nums, n_creative_id, n_ad_id, n_product_id, n_advertiser_id, n_product_category, n_industry = [], [], [], [], [], [], []
    with open(in_path, 'r') as f:
        for line in tqdm(f):
            seqences = line.split('\t')[1:]
            arr = np.array([list(map(int, seq.split('_'))) for seq in seqences])
            # label
            age.append(arr[0, 0])
            gender.append(arr[0, 1])
            creative_id.append(arr[:, 3])
#             click_times.append(arr[:, 4])
            ad_id.append(arr[:, 5])
            product_id.append(arr[:, 6])
            product_category.append(arr[:, 7])
            advertiser_id.append(arr[:, 8])
            industry.append(arr[:, 9])
            # 统计类特征
            click_nums.append(sum(arr[:, 4]))
            n_creative_id.append(len(set(arr[:, 3])))
            n_ad_id.append(len(set(arr[:, 5])))
            n_product_id.append(len(set(arr[:, 6])))
            n_advertiser_id.append(len(set(arr[:, 8])))
            n_product_category.append(len(set(arr[:, 7])))
            n_industry.append(len(set(arr[:, 9])))
    print(len(creative_id), len(ad_id), len(product_id), len(advertiser_id), len(age))
#     import pdb;pdb.set_trace()
    print(age[:10])
    f_creative_id = sum_pooling(creative_id, model_1, 256)
    f_ad_id = sum_pooling(ad_id, model_2, 256)
    f_product_id = sum_pooling(product_id, model_3, 256)
    f_advertiser_id = sum_pooling(advertiser_id, model_4, 256)
    f_product_category = sum_pooling(product_category, model_5, 16)
    f_industry = sum_pooling(industry, model_6, 32)
    
    age = np.array(age, dtype=int)
    gender = np.array(gender, dtype=int)
    click_nums = np.array(click_nums, dtype=int)
    n_creative_id = np.array(n_creative_id, dtype=int)
    n_ad_id = np.array(n_ad_id, dtype=int)
    n_product_id = np.array(n_product_id, dtype=int)
    n_advertiser_id = np.array(n_advertiser_id, dtype=int)
    n_product_category = np.array(n_product_category, dtype=int)
    n_industry = np.array(n_industry, dtype=int)
    
    f_feature_group_1 = [age[:, None], gender[:, None], click_nums[:, None], n_creative_id[:, None], n_ad_id[:, None], n_product_id[:, None], n_advertiser_id[:, None],n_product_category[:, None], n_industry[:, None]]
    f_feature_group_1 = np.concatenate(f_feature_group_1, axis=1)
    columns = ['age', 'gender', 'click_nums', 'n_creative_id', 'n_ad_id', 'n_product_id', 'n_advertiser_id', 'n_product_category', 'n_industry']
    df1 = pd.DataFrame(f_feature_group_1, columns=columns, dtype=int)
    
    f_feature_group_2 = [f_creative_id, f_ad_id, f_product_id, f_advertiser_id, f_product_category, f_industry]
    f_feature_group_2 = np.concatenate(f_feature_group_2, axis=1)
    columns = ['f_creative_id_{}'.format(i) for i in range(256)] + ['f_ad_id_{}'.format(i) for i in range(256)] + \
        ['f_product_id_{}'.format(i) for i in range(256)] + ['f_advertiser_id_{}'.format(i) for i in range(256)] + \
        ['f_product_category_{}'.format(i) for i in range(16)] + ['f_industry_{}'.format(i) for i in range(32)]
    df2 = pd.DataFrame(f_feature_group_2, columns=columns)
    
    df = pd.concat([df1, df2], axis=1)
    df.to_pickle(out_path)
    return df

In [15]:
val_df = process('val_group.csv', './word2vec/val.pkl')
val_df.head()

135000it [00:28, 4683.77it/s]


135000 135000 135000 135000 135000
[3, 5, 6, 2, 3, 5, 1, 1, 3, 1]


Unnamed: 0,age,gender,click_nums,n_creative_id,n_ad_id,n_product_id,n_advertiser_id,n_product_category,n_industry,f_creative_id_0,...,f_industry_22,f_industry_23,f_industry_24,f_industry_25,f_industry_26,f_industry_27,f_industry_28,f_industry_29,f_industry_30,f_industry_31
0,3,0,14,12,12,6,12,3,9,0.000514,...,-0.023894,0.023569,-0.011116,-0.006597,0.002057,-0.045657,0.017037,-0.015626,0.046506,0.021147
1,5,1,10,10,10,5,10,4,8,-0.001525,...,-0.016439,0.011322,0.004477,-0.018333,0.011657,-0.004234,0.039023,-0.011462,0.021666,-0.016961
2,6,0,56,45,45,18,31,5,15,-0.000359,...,-0.034426,0.166529,-0.176119,-0.19755,-0.166245,-0.126803,0.116033,-0.272958,-0.12849,0.16733
3,2,1,20,18,18,7,17,4,9,0.004194,...,-0.093434,-0.026102,-0.084669,-0.023383,0.087739,-0.047008,0.063857,-0.060413,0.036713,0.027357
4,3,0,17,14,14,6,10,2,4,0.013038,...,-0.007565,0.002949,-0.089907,-0.050586,0.11108,-0.117876,0.026739,-0.00068,0.02234,0.032577


In [16]:
test_df = process('test_group.csv', './word2vec/test.pkl')
test_df.head()

134999it [00:28, 4746.54it/s]


134999 134999 134999 134999 134999
[4, 4, 2, 1, 3, 3, 1, 6, 5, 5]


Unnamed: 0,age,gender,click_nums,n_creative_id,n_ad_id,n_product_id,n_advertiser_id,n_product_category,n_industry,f_creative_id_0,...,f_industry_22,f_industry_23,f_industry_24,f_industry_25,f_industry_26,f_industry_27,f_industry_28,f_industry_29,f_industry_30,f_industry_31
0,4,0,14,11,11,5,11,5,6,0.005593,...,-0.122852,0.018458,-0.134982,-0.083445,0.110419,-0.048232,-0.010352,-0.078555,-0.003744,0.057744
1,4,1,33,29,29,11,25,4,19,-0.004063,...,-0.052801,0.040497,-0.120739,-0.133838,0.059935,-0.08575,-0.023811,-0.019511,-0.044966,0.118492
2,2,0,53,47,47,10,31,4,20,0.005815,...,-0.12695,0.071603,-0.317156,0.043249,0.196016,-0.146334,0.079844,-0.133096,0.10446,0.169656
3,1,1,15,13,13,7,9,3,7,-0.006537,...,-0.079904,-0.05378,-0.015708,0.025063,0.065593,-0.017647,0.082941,-0.049714,0.00016,0.000246
4,3,0,15,14,14,8,13,4,8,0.006833,...,0.019135,0.050291,-0.069139,-0.024612,-0.034093,0.064728,0.004353,-0.042804,-0.061509,-0.004131
