In [1]:
# !mkdir /home/aistudio/external-libraries

In [2]:
# !conda install catboost -p /home/aistudio/external-libraries

In [3]:
!pip install gensim lightgbm

Looking in indexes: https://pypi.mirrors.ustc.edu.cn/simple/


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import gc
%matplotlib inline

In [5]:
# zipfile.ZipFile('data/data36883/腾讯.zip').extractall('./work')

In [6]:
# zipfile.ZipFile('work/train_preliminary.zip').extractall('./work')
# zipfile.ZipFile('work/test.zip').extractall('./work')

### Load data

In [7]:
train_path = "work/train_preliminary/"
test_path = "work/test/"

In [8]:
train_ad = pd.read_csv(train_path + "ad.csv")
train_click_log = pd.read_csv(train_path + "click_log.csv")
train_user = pd.read_csv(train_path + "user.csv")

In [9]:
test_ad = pd.read_csv(test_path + "ad.csv")
test_click_log = pd.read_csv(test_path + "click_log.csv")

In [10]:
# 训练数据数据的 ad click_log 和 用户人数
train_ad.shape, train_click_log.shape, train_user.shape, len(train_click_log.user_id.unique())

((2481135, 6), (30082771, 4), (900000, 3), 900000)

In [11]:
# 测试数据的 ad click_log 和 用户人数
test_ad.shape, test_click_log.shape, len(test_click_log.user_id.unique())

((2618159, 6), (33585512, 4), 1000000)

In [12]:
train_ad.head(2)

Unnamed: 0,creative_id,ad_id,product_id,product_category,advertiser_id,industry
0,1,1,\N,5,381,78
1,4,4,\N,5,108,202


In [13]:
train_click_log.tail(2)

Unnamed: 0,time,user_id,creative_id,click_times
30082769,86,30920,2713031,1
30082770,25,30920,629802,1


In [14]:
train_user.head(2)

Unnamed: 0,user_id,age,gender
0,1,4,1
1,2,10,1


In [15]:
# # train_user 和 train_click_log 的 user_id 是一致的
# set(train_user.user_id) == set(train_click_log.user_id)

In [16]:
# # train_ad 和 train_click_log 的 creative_id 是一致的
# len(set(train_ad.creative_id)), len(set(train_click_log.creative_id))
# set(train_ad.creative_id) == set(train_click_log.creative_id)

In [17]:
test_click_log.head(2)

Unnamed: 0,time,user_id,creative_id,click_times
0,20,3131989,645764,1
1,20,3131989,1027422,1


In [18]:
df_train = train_ad.merge(train_click_log, on = "creative_id", how = "left")

In [19]:
df_test = test_ad.merge(test_click_log, on = "creative_id", how = "left")

In [20]:
df_train.head(2)

Unnamed: 0,creative_id,ad_id,product_id,product_category,advertiser_id,industry,time,user_id,click_times
0,1,1,\N,5,381,78,81,398695,1
1,1,1,\N,5,381,78,82,404020,1


In [21]:
df_test.head(2)

Unnamed: 0,creative_id,ad_id,product_id,product_category,advertiser_id,industry,time,user_id,click_times
0,1,1,\N,5,381,78,81,3153317,1
1,1,1,\N,5,381,78,81,3284714,1


In [22]:
df_data = pd.concat([df_train, df_test], ignore_index = True)

### EDA

In [23]:
# # 缺失值是 '\\N', 先替换成nan.
# df_data = df_data.replace('\\N', np.nan)

In [24]:
# 缺失值
# (df_data.isnull().sum()).plot.bar(title='Data null')

In [25]:
# plt.rcParams['font.sans-serif']=['SimHei']
# train_user['age'].value_counts().plot.pie(autopct='%1.1f%%',title = '年龄分布比例')

In [26]:
# # 性别比例大约是2:1
# train_user['gender'].value_counts().plot.bar(title = '性别分布比例')

In [27]:
# df_data[["creative_id", "ad_id", "user_id"]].boxplot()

In [28]:
# df_data[["advertiser_id", "product_category", "time", "click_times"]].boxplot()

### 提取特征

In [29]:
# # 缺失值用-1补充
# df_data = df_data.fillna(-1)

In [30]:
# for col in df_data.columns:
#     print(col, ":", type(df_data[col][0]))

In [31]:
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
# for col in ["creative_id", "ad_id", "product_category", "advertiser_id", "time", "click_times"]:
#     le = LabelEncoder()
#     df_data[col] = le.fit_transform(df_data[col])

In [32]:
Feats = df_data.drop_duplicates(subset = "user_id")[["user_id"]]

In [33]:
del df_data
gc.collect()

20

In [34]:
#构造统计特征
def group_feature(df, key, target, aggs):   
    agg_dict = {}
    for ag in aggs:
        agg_dict[f'{target}_{ag}'] = ag
    print(agg_dict)
    df[target] = df[target].astype(float)
    t = df.groupby(key)[target].agg(agg_dict).reset_index()
    return t

In [35]:
# for col in ["creative_id", "ad_id", "product_category", "advertiser_id", "time", "click_times"]:
#     t = group_feature(df_data, "user_id", col, ['max','min','mean','sum','std', 'nunique', 'count'])
#     Feats = Feats.merge(t, on = "user_id", how = "left")

In [36]:
def get_count_tfidf(df, group_id, group_target, num):
    df[group_target] = df[group_target].astype(str)
    tmp = df.groupby(group_id)[group_target].agg(list).reset_index()
    tmp[group_target] = tmp[group_target].apply(lambda x: ' '.join(x))

    tfidf_enc_tmp = TfidfVectorizer(max_features = 100000, min_df = 3)
    tfidf_vec_tmp = tfidf_enc_tmp.fit_transform(tmp[group_target])
    svd_tag_tmp = TruncatedSVD(n_components=num, n_iter=5, random_state=52)
    tag_svd_tmp = svd_tag_tmp.fit_transform(tfidf_vec_tmp)
    tag_svd_tmp = pd.DataFrame(tag_svd_tmp)
    tag_svd_tmp.columns = ['{}_tfidf_{}'.format(group_target, i)
                           for i in range(num)]

    countvec = CountVectorizer(max_features = 100000, min_df = 3)
    count_vec_tmp = countvec.fit_transform(tmp[group_target])
    svd_tmp = TruncatedSVD(n_components=num, n_iter=5, random_state=52)
    svd_tmp = svd_tmp.fit_transform(count_vec_tmp)
    svd_tmp = pd.DataFrame(svd_tmp)
    svd_tmp.columns = ['{}_countvec_{}'.format(group_target, i)
                       for i in range(num)]

    return pd.concat([tmp[[group_id]], tag_svd_tmp, svd_tmp], axis=1)

In [37]:
# # for col in ["creative_id", "ad_id", "advertiser_id"]:
# for col in ["advertiser_id"]:
#     t = get_count_tfidf(df_data, "user_id", col, 30)
#     Feats = Feats.merge(t, on = "user_id", how = "left")

In [38]:
def hashfxn(astring):
    return ord(astring[0])
def w2v_feat(df, group_id, feat, length):
    df[feat] = df[feat].astype(str)
    data_frame = df.groupby(group_id)[feat].agg(list).reset_index()
    model = Word2Vec(data_frame[feat].values,
                     size=length,
                     window=5,
                     min_count=1,
                     workers=8,
                     iter=5,
                     seed=1,
                     hashfxn=hashfxn)
    data_frame[feat] = data_frame[feat].apply(
        lambda x: np.array([model.wv[c] for c in x])
    )
    
    for m in range(length):
        data_frame['w2v_{}_mean'.format(m)] = data_frame[feat].apply(
            lambda x: x[:, m].mean()
        )
    del data_frame[feat]
    gc.collect()
    return data_frame

In [39]:
# for col in ["creative_id", "ad_id", "advertiser_id"]:
# for col in ["advertiser_id"]:
#     t = w2v_feat(df_data, "user_id", col, 50)
#     Feats = Feats.merge(t, on = "user_id", how = "left")

In [None]:
feat_files = ['creative_id_w2v_feats.csv', 'ad_id_w2v_feats.csv', 'advertiser_id_w2v_feats.csv',
    'ad_id_tfidf_feats.csv', 'advertiser_id_tfidf_feats.csv', 'creative_id_tfidf_feats.csv', 'group_feats.csv']

In [None]:
for df_file in feat_files:
    t = pd.read_csv(df_file)
    Feats = Feats.merge(t, on = "user_id", how = "left")
    del t
    gc.collect()

In [None]:
Feats = Feats.set_index('user_id')

In [None]:
# Feats.to_csv("./group_feats.csv")

In [None]:
X_train = Feats.iloc[train_user.user_id].sort_index()

In [None]:
X_test = Feats.drop(train_user.user_id, axis = 0).sort_index()

In [None]:
y_age = train_user[['age','user_id']].set_index("user_id")
y_gender = train_user[['gender','user_id']].set_index("user_id")

In [None]:
age_map = {i: i-1 for i in range(1,11)}
age_map_rev = {v:k for k,v in age_map.items()}
gender_map = {1: 1, 2: 0}
gender_map_rev = {v:k for k,v in gender_map.items()}

In [None]:
y_age.age = y_age.age.map(age_map)
y_gender.gender = y_gender.gender.map(gender_map)

In [None]:
X_train.shape, X_test.shape, y_age.shape, y_gender.shape

((900000, 372), (1000000, 372), (900000, 1), (900000, 1))

### Model

In [None]:
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

In [None]:
lgb_params_age = {
    'learning_rate' : 0.1,
    # 'min_child_samples': 5,
    'max_depth': 7,
    'lambda_l1': 2,
    # 'feature_fraction': .75,
    # 'bagging_fraction': .85,
    # 'seed': 99,
    'n_estimators': 3000,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 10,
    'early_stopping_rounds': 100,
}
fold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
models_age = []
pred_age = np.zeros((X_test.shape[0], 10))
oof_age = np.zeros((X_train.shape[0], 10))

In [None]:
for index, (train_idx, val_idx) in enumerate(fold.split(X_train, y_age)):

    train_set = lgb.Dataset(X_train.iloc[train_idx], y_age.iloc[train_idx])
    val_set = lgb.Dataset(X_train.iloc[val_idx], y_age.iloc[val_idx])

    model = lgb.train(lgb_params_age, train_set, valid_sets=[train_set, val_set],verbose_eval=100)
    models_age.append(model)

    val_pred = model.predict(X_train.iloc[val_idx])
    oof_age[val_idx] = val_pred
    val_y = y_age.age.values[val_idx]
    val_pred = np.argmax(val_pred, axis = 1)

    print(index+1, 'val acc:', metrics.accuracy_score(val_y, val_pred))
    test_pred = model.predict(X_test)
    pred_age += test_pred/fold.n_splits
    del train_set, val_set, val_pred, val_y, test_pred
    gc.collect()



Training until validation scores don't improve for 100 rounds


In [None]:
lgb_params_gender = {
    'n_estimators': 3000,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # 'metric': 'None',
    # 'num_leaves': 63,
    # 'subsample': 0.8,
    # 'colsample_bytree': 0.8,
    'learning_rate': 0.05,
    'lambda_l2':2,
    'nthread': -1,
    # 'silent': True,
    'early_stopping_rounds': 100,
}
models_gender = []
oof_gender = np.zeros(X_train.shape[0])
pred_gender = np.zeros(X_test.shape[0])

In [None]:
for index, (train_idx, val_idx) in enumerate(fold.split(X_train, y_gender)):

    train_set = lgb.Dataset(X_train.iloc[train_idx], y_gender.iloc[train_idx])
    val_set = lgb.Dataset(X_train.iloc[val_idx], y_gender.iloc[val_idx])

    model = lgb.train(lgb_params_gender, train_set, valid_sets=[train_set, val_set], verbose_eval=100)
    models_gender.append(model)
    val_pred = model.predict(X_train.iloc[val_idx])
    oof_gender[val_idx] = val_pred
    val_y = y_gender.values.reshape((-1))[val_idx]
    val_pred = np.round(val_pred)
    
    print(index+1, 'val acc:', metrics.accuracy_score(val_y, val_pred))
    test_pred = model.predict(X_test)
    pred_gender += test_pred/fold.n_splits
    del train_set, val_set, val_pred, val_y, test_pred
    gc.collect()
# 372feats : 42 stat, 150 w2v, 180 tf_idf
# 线下 0.717
# 线上 0.876774

In [None]:
np.unique(np.argmax(pred_age, axis=1))

In [None]:
np.unique(np.round(pred_gender))

### Feature importance

In [None]:
ret = []
for index, model in enumerate(models_age):
    df = pd.DataFrame()
    df['name'] = model.feature_name()
    df['score'] = model.feature_importance()
    df['fold'] = index
    ret.append(df)
    
df = pd.concat(ret)

df = df.groupby('name', as_index=False)['score'].mean()
df.sort_values(['score'], ascending=False)

In [None]:
ret = []
for index, model in enumerate(models_gender):
    df = pd.DataFrame()
    df['name'] = model.feature_name()
    df['score'] = model.feature_importance()
    df['fold'] = index
    ret.append(df)
    
df = pd.concat(ret)

df = df.groupby('name', as_index=False)['score'].mean()
df.sort_values(['score'], ascending=False)

### Submission

In [None]:
sub = pd.DataFrame()

In [None]:
sub['user_id'] = X_test.index
sub['predicted_age'] = np.argmax(pred_age, axis = 1)
# sub['predicted_age'] = -1
sub['predicted_gender'] = np.round(pred_gender).astype(int)

In [None]:
sub["predicted_gender"] = sub["predicted_gender"].map(gender_map_rev)
sub["predicted_age"] = sub["predicted_age"].map(age_map_rev)

In [None]:
sub["predicted_gender"].value_counts()

In [None]:
sub["predicted_age"].value_counts()

In [None]:
sub.to_csv("./submission.csv", header = True, index = False, encoding='utf-8')

In [66]:
!cat submission.csv

316779

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

