In [1]:
import pandas as pd
import numpy as np
#import matplotlib
#import matplotlib.pyplot as plt
from collections import Counter
#from scipy.stats import stats
#import lightgbm as lgb
%matplotlib inline
import gc
#from sklearn.model_selection import StratifiedKFold, GridSearchCV
#from itertools import combinations, product

# Memory optimization

In [2]:
%%time
# Read the four tab-separated raw logs (no header row), sort each by user and
# day, downcast every column to the smallest unsigned integer type that fits,
# and cache the result as a pickle with a fresh 0..n-1 index.
# NOTE(review): because of downcast='unsigned', later day arithmetic on these
# columns cannot represent negative differences unless it is cast to a signed
# type first.
register = pd.read_table('../raw_data/user_register_log.txt', sep='\t', header=None, 
             names=['user_id', 'register_day',
                    'register_type', 'device_type']).sort_values(
                                            by=['user_id', 'register_day'])
register =  register.apply(pd.to_numeric, downcast='unsigned')
launch = pd.read_table('../raw_data/app_launch_log.txt', sep='\t', header=None,
                      names=['user_id', 'launch_day']).sort_values(by=['user_id',
                                                                     'launch_day'])
launch =  launch.apply(pd.to_numeric, downcast='unsigned')
video = pd.read_table('../raw_data/video_create_log.txt', sep='\t', header=None,
                     names=['user_id', 'create_day']).sort_values(by=['user_id',
                                                                     'create_day'])
video =  video.apply(pd.to_numeric, downcast='unsigned')
activity = pd.read_table('../raw_data/user_activity_log.txt', sep='\t', header=None,
                        names=['user_id', 'act_day', 'act_page', 'video_id', 'author_id',
                              'act_type']).sort_values(by=['user_id', 'act_day'])
activity =  activity.apply(pd.to_numeric, downcast='unsigned')
# The video-creation log is stored under the name create.pkl.
register.reset_index(drop=True).to_pickle('../tmp_data/register.pkl',)
launch.reset_index(drop=True).to_pickle('../tmp_data/launch.pkl',)
video.reset_index(drop=True).to_pickle('../tmp_data/create.pkl',)
activity.reset_index(drop=True).to_pickle('../tmp_data/activity.pkl')
# Release the in-memory copies; later cells reload the tables from the pickles.
del register, launch, video, activity
gc.collect()

CPU times: user 24.7 s, sys: 7.39 s, total: 32.1 s
Wall time: 32.2 s


In [None]:
# Template commands for (un)packing the cached data as a bzip2 tarball.
# They are kept as comments because the <...> placeholders are not valid shell
# arguments: executed as-is, the shell would treat them as redirections.
# Replace the placeholders and remove the leading "# " to run one of them.
# !tar -jcvf <archive-to-create> <files-to-compress>   # compress
# !tar -jxvf <archive-to-extract>                      # extract

# Load preprocessed raw data

In [None]:
%%time
# Reload the four cached tables (register, launch, create, activity).
# NOTE(review): these pickles are read from /home/kesci/input/, whereas the
# caching cell writes them to ../tmp_data/ -- confirm which location is the
# intended one for this environment.
register = pd.read_pickle('/home/kesci/input/register.pkl',)
launch = pd.read_pickle('/home/kesci/input/launch.pkl',)
create = pd.read_pickle('/home/kesci/input/create.pkl',)
activity = pd.read_pickle('/home/kesci/input/activity.pkl')

In [None]:
# Row count of the register table, and number of distinct users seen in each
# of the three event logs, printed on a single line.
user_counts = [
    ('register user:', len(register['user_id'])),
    ('launch user:', len(np.unique(launch['user_id']))),
    ('create user:', len(np.unique(create['user_id']))),
    ('activity user:', len(np.unique(activity['user_id']))),
]
print(*[item for pair in user_counts for item in pair])

# 划分数据集

In [None]:
def cut_data(register, create, launch, activity, s_day_user, e_day_user, s_day_fea, e_day_fea, *label_cut_day):
    """Slice the four logs into a user window, a feature window and an optional label window.

    Args:
        register, create, launch, activity: the four log tables.
        s_day_user, e_day_user: closed interval of ``register_day`` values that
            selects the users to keep.
        s_day_fea, e_day_fea: closed interval of days used to select the
            create / launch / activity rows.
        *label_cut_day: when exactly two values ``(start, end)`` are given, a
            ``label`` column is added to the returned register table: 1 if the
            user appears in any of the four logs within that closed interval,
            otherwise 0.  Any other number of extra values is ignored.

    Returns:
        ``(train_register, train_create, train_launch, train_activity)``, each
        with a fresh 0..n-1 index.  ``train_activity`` additionally carries a
        boolean ``author_equal_user`` column (``user_id == author_id``).
    """
    def _in_window(table, day_col, first_day, last_day):
        # Rows whose day lies in the closed interval [first_day, last_day].
        keep = (first_day <= table[day_col]) & (table[day_col] <= last_day)
        return table[keep]

    train_register = _in_window(register, 'register_day', s_day_user, e_day_user).reset_index(drop=True)
    train_create = _in_window(create, 'create_day', s_day_fea, e_day_fea).reset_index(drop=True)
    train_launch = _in_window(launch, 'launch_day', s_day_fea, e_day_fea).reset_index(drop=True)
    train_activity = _in_window(activity, 'act_day', s_day_fea, e_day_fea).reset_index(drop=True)
    train_activity['author_equal_user'] = train_activity['user_id'] == train_activity['author_id']

    if len(label_cut_day) == 2:
        s_day_label, e_day_label = label_cut_day
        label_tables = ((register, 'register_day'), (create, 'create_day'),
                        (launch, 'launch_day'), (activity, 'act_day'))
        # Every user with at least one row of any kind inside the label window.
        test_user_id = set()
        for table, day_col in label_tables:
            in_label = _in_window(table, day_col, s_day_label, e_day_label)
            test_user_id.update(in_label.user_id.values.tolist())
        train_register['label'] = [int(uid in test_user_id) for uid in train_register.user_id]
    return train_register, train_create, train_launch, train_activity

# 生成基础统计特征

In [None]:
# Each table produces its own basic statistical features.
# Note: the data shows a surge on days 6, 7, 13, 14, 21, 22, 27 and 28.
def get_register_feature(train_register, label_day):
    """Add the distance from registration day to ``label_day``.

    The column ``register_day_to_label`` (``label_day - register_day``) is
    written into ``train_register`` in place; the same frame is returned.
    The existing columns (register type, device type, ...) are left untouched.
    """
    days_until_label = label_day - train_register['register_day']
    train_register['register_day_to_label'] = days_until_label
    return train_register

def get_create_feature(group, label_day):
    """Summarise one user's video-creation rows as a float Series of statistics.

    Args:
        group: rows of a single user; must contain ``create_day`` and
            ``register_day`` (only the first ``register_day`` value is used).
        label_day: the day the label window starts.

    Returns:
        ``pd.Series`` (float64) with the creation count, the busiest-day count,
        max/min/mean/median/std of (a) label_day - create day, (b) create day -
        register day and (c) their difference, the mean creations per active
        day and, when the user created on more than one distinct day,
        statistics of the gaps between consecutive distinct days.
        ``max_contin_create_days`` is only present when at least two distinct
        creation days are consecutive.
    """
    stat_funcs = (('max', np.max), ('min', np.min), ('mean', np.mean),
                  ('median', np.median), ('std', np.std))
    # Work on signed copies: the day columns may be unsigned integers (the
    # loading cell downcasts with downcast='unsigned'), and unsigned
    # subtraction silently wraps around whenever the true result is negative.
    create_days = np.unique(group['create_day']).astype(np.int64)
    register_day = float(group['register_day'].iloc[0])

    feature = {}
    feature['create_count'] = len(group)  # number of created videos
    # largest number of videos created on a single day
    feature['max_create_for_oneday'] = max(Counter(group['create_day']).values())

    dis2lab = label_day - create_days      # distance of each distinct day to the label day
    dis2reg = create_days - register_day   # distance of each distinct day to the register day
    gap4cre = dis2reg - dis2lab
    for suffix, values in (('create_day2label', dis2lab),
                           ('create_day2regis', dis2reg),
                           ('gap4cre', gap4cre)):
        for stat, func in stat_funcs:
            feature[stat + '_' + suffix] = func(values)

    feature['mean_daily_create'] = len(group) / len(create_days)

    diff_day = np.diff(create_days)
    if len(diff_day) != 0:
        # Longest run of gaps equal to 1, i.e. longest streak of consecutive days.
        run = longest_run = 0
        for gap in diff_day:
            run = run + 1 if gap == 1 else 0
            longest_run = max(longest_run, run)
        if longest_run:
            feature['max_contin_create_days'] = 1 + longest_run
        feature['create_day_diff_mean'] = np.mean(diff_day)
        feature['create_day_diff_std'] = np.std(diff_day)
        feature['create_day_diff_max'] = np.max(diff_day)
        feature['create_day_diff_min'] = np.min(diff_day)
    # Explicit dtype: a bare pd.Series() has a version-dependent default dtype.
    return pd.Series(feature, dtype='float64')

def get_launch_feature(group, label_day):
    """Summarise one user's app-launch rows as a float Series of statistics.

    Args:
        group: rows of a single user; must contain ``launch_day`` and
            ``register_day`` (only the first ``register_day`` value is used).
            Rows are used in their given order and are not de-duplicated.
        label_day: the day the label window starts.

    Returns:
        ``pd.Series`` (float64) with the launch count, max/min/mean/median/std
        of (a) label_day - launch day, (b) launch day - register day and
        (c) their difference and, when there is more than one row, statistics
        of the gaps between consecutive rows.  ``max_contin_launch_days`` is
        only present when at least one gap equals 1.
    """
    stat_funcs = (('max', np.max), ('min', np.min), ('mean', np.mean),
                  ('median', np.median), ('std', np.std))
    # Signed copy: the day columns may be unsigned integers (the loading cell
    # downcasts with downcast='unsigned'), and unsigned subtraction silently
    # wraps around whenever the true result is negative.
    launch_days = np.asarray(group['launch_day'], dtype=np.int64)
    register_day = float(group['register_day'].iloc[0])

    feature = {}
    feature['launch_count'] = len(group)  # number of launches

    dis2lab = label_day - launch_days      # distance of each launch to the label day
    dis2reg = launch_days - register_day   # distance of each launch to the register day
    gap4lau = dis2reg - dis2lab
    for suffix, values in (('launch_day2label', dis2lab),
                           ('launch_day2regis', dis2reg),
                           ('gap4lau', gap4lau)):
        for stat, func in stat_funcs:
            feature[stat + '_' + suffix] = func(values)

    diff_day = np.diff(launch_days)
    if len(diff_day) != 0:
        # Longest run of gaps equal to 1, i.e. longest streak of consecutive days.
        run = longest_run = 0
        for gap in diff_day:
            run = run + 1 if gap == 1 else 0
            longest_run = max(longest_run, run)
        if longest_run:
            feature['max_contin_launch_days'] = 1 + longest_run
        feature['launch_day_diff_mean'] = np.mean(diff_day)
        feature['launch_day_diff_std'] = np.std(diff_day)
        feature['launch_day_diff_max'] = np.max(diff_day)
        feature['launch_day_diff_min'] = np.min(diff_day)
    # Explicit dtype: a bare pd.Series() has a version-dependent default dtype.
    return pd.Series(feature, dtype='float64')

def get_activity_feature(group, label_day):
    """Summarise one user's activity rows as a float Series of statistics.

    Args:
        group: rows of a single user; must contain ``act_day``, ``act_page``,
            ``act_type``, ``video_id``, ``author_id``, ``author_equal_user``
            and ``register_day`` (only the first ``register_day`` is used).
        label_day: the day the label window starts.

    Returns:
        ``pd.Series`` (float64) with row counts, repeat counts per video /
        author / day, max/min/mean/median/std of (a) label_day - active day,
        (b) active day - register day and (c) their difference (over distinct
        active days), gap statistics between consecutive distinct active days
        (only when there is more than one), and the count and share of rows
        for each ``act_page`` value 0-4 and each ``act_type`` value 0-5.
        ``max_contin_act_days`` is only present when at least two distinct
        active days are consecutive.
    """
    stat_funcs = (('max', np.max), ('min', np.min), ('mean', np.mean),
                  ('median', np.median), ('std', np.std))
    # Signed copy: the day columns may be unsigned integers (the loading cell
    # downcasts with downcast='unsigned'), and unsigned subtraction silently
    # wraps around whenever the true result is negative.
    act_days = np.unique(group['act_day']).astype(np.int64)
    register_day = float(group['register_day'].iloc[0])
    n_rows = len(group)

    feature = {}
    feature['activity_count'] = n_rows
    # number of rows where the user acted on their own video
    feature['sum_author_equal_user'] = group['author_equal_user'].sum()
    # largest number of actions on one video / on one author
    feature['max_repeat_video'] = max(Counter(group['video_id']).values())
    feature['max_repeat_author'] = max(Counter(group['author_id']).values())
    # largest number of actions on a single day, and mean actions per active day
    feature['max_act_for_oneday'] = max(Counter(group['act_day']).values())
    feature['mean_daily_act'] = n_rows / len(act_days)

    dis2lab = label_day - act_days      # distance of each active day to the label day
    dis2reg = act_days - register_day   # distance of each active day to the register day
    gap4act = dis2reg - dis2lab
    for suffix, values in (('act_day2label', dis2lab),
                           ('act_day2regis', dis2reg),
                           ('gap4act', gap4act)):
        for stat, func in stat_funcs:
            feature[stat + '_' + suffix] = func(values)

    diff_day = np.diff(act_days)
    if len(diff_day) != 0:
        # Longest run of gaps equal to 1, i.e. longest streak of consecutive days.
        run = longest_run = 0
        for gap in diff_day:
            run = run + 1 if gap == 1 else 0
            longest_run = max(longest_run, run)
        if longest_run:
            feature['max_contin_act_days'] = 1 + longest_run
        feature['act_day_diff_mean'] = np.mean(diff_day)
        feature['act_day_diff_std'] = np.std(diff_day)
        feature['act_day_diff_max'] = np.max(diff_day)
        feature['act_day_diff_min'] = np.min(diff_day)

    # Count and share of rows per page (0-4) and per action type (0-5).
    for column, tag, n_values in (('act_page', 'actpage', 5), ('act_type', 'actype', 6)):
        counts = [np.sum(group[column] == value) for value in range(n_values)]
        for value, count in enumerate(counts):
            feature['%d_%s_count' % (value, tag)] = count
        for value, count in enumerate(counts):
            feature['%d_%s_count_div_sum' % (value, tag)] = count / n_rows
    # Explicit dtype: a bare pd.Series() has a version-dependent default dtype.
    return pd.Series(feature, dtype='float64')
def deal_feature(train_register, train_create, train_launch, train_act, label_day):
    """Build the base feature table: one row per registered user.

    Args:
        train_register: register rows of the users to describe.  Modified in
            place: ``register_day_to_label`` and ``user_impact`` are added.
        train_create, train_launch, train_act: event rows of the feature window.
        label_day: the day the label window starts.

    Returns:
        DataFrame with the register columns (minus ``register_day``),
        ``user_impact`` and the per-user create / launch / activity
        statistics; users without rows in a table get NaN for its features.
    """
    def _per_user_features(events, extractor):
        # Attach the register day, run the extractor once per user and return
        # a wide frame with a 'user_id' column and one column per feature.
        events = pd.merge(events, train_register[['user_id', 'register_day']],
                          on='user_id', how='left')
        result = events.groupby('user_id', sort=True).apply(extractor, label_day)
        # groupby.apply yields a long (user_id, feature) Series when users have
        # different feature sets, but already a wide DataFrame when all
        # feature sets are identical; handle both.
        if isinstance(result, pd.Series):
            result = result.unstack()
        result.index.name = 'user_id'
        result.columns.name = None
        return result.reset_index()

    train = get_register_feature(train_register, label_day)
    # user_impact: number of activity rows in which the user is the author
    # (0 if none).  Looked up by user_id so it does not depend on row order.
    author_counts = train_act['author_id'].value_counts()
    train['user_impact'] = train['user_id'].map(author_counts).fillna(0)
    print('register表特征提取完毕')

    train = pd.merge(train, _per_user_features(train_create, get_create_feature),
                     on='user_id', how='left')
    print('create表特征提取完毕')

    train = pd.merge(train, _per_user_features(train_launch, get_launch_feature),
                     on='user_id', how='left')
    print('launch表特征提取完毕')

    train = pd.merge(train, _per_user_features(train_act, get_activity_feature),
                     on='user_id', how='left')
    print('activity表特征提取完毕')
    return train.drop(columns=['register_day'], errors='ignore')

In [None]:
%%time 
#--------------------------------------------cut1---------------------------------------------------------
train_reg1, train_cre1, train_lau1, train_act1 = cut_data(register, create, launch, activity,1, 16, 1, 16, 17, 23)
train1 = deal_feature(train_reg1, train_cre1, train_lau1, train_act1, 17)
train1.to_hdf('../tmp_data/cut_train1.h5', key='train1', complevel=9, complib='zlib')
del train_reg1, train_cre1, train_lau1, train_act1, train1
gc.collect()

train_reg2, train_cre2, train_lau2, train_act2 = cut_data(register, create, launch, activity,1, 23, 8, 23, 24, 30)
train2 = deal_feature(train_reg2, train_cre2, train_lau2, train_act2, 24)
train2.to_hdf('../tmp_data/cut_train2.h5', key='train2', complevel=9, complib='zlib')
print('train done')
del train_reg2, train_cre2, train_lau2, train_act2, train2
gc.collect()

test_reg, test_cre, test_lau, test_act = cut_data(register, create, launch, activity,1, 30, 15, 30)
test = deal_feature(test_reg, test_cre, test_lau, test_act, 31)
test.to_hdf('../tmp_data/cut_test.h5', key='test', complevel=9, complib='zlib')
del test_reg, test_cre, test_lau, test_act, test
del register, create, launch, activity
gc.collect()

# add_fea

In [None]:
# Each table produces its own basic statistical features.
# Note: the data shows a surge on days 6, 7, 13, 14, 21, 22, 27 and 28.
def add_register_feature(train_register):
    """Return a one-column frame holding only ``user_id``.

    No register-based features (register type, device type, ...) are added
    here; the result is the key table the other features are merged onto.
    """
    key_columns = ['user_id']
    return train_register[key_columns]

def add_create_feature(group):
    """Extra creation features for one user.

    Args:
        group: rows of a single user; must contain ``create_day``.

    Returns:
        ``pd.Series`` (float64) with ``min_create_for_oneday`` (fewest videos
        created on any day with at least one creation) and
        ``std_create_daily`` (population std of the per-day counts).
    """
    # Per-day creation counts, computed once and shared by both statistics.
    daily_counts = list(Counter(group['create_day']).values())
    # Explicit dtype: a bare pd.Series() has a version-dependent default dtype.
    return pd.Series({
        'min_create_for_oneday': min(daily_counts),
        'std_create_daily': np.std(daily_counts),
    }, dtype='float64')

def add_activity_feature(group):
    """Extra activity features for one user.

    Args:
        group: rows of a single user; must contain ``act_day``, ``video_id``
            and ``author_id``.

    Returns:
        ``pd.Series`` (float64) with the number of distinct videos / authors,
        min and std of actions per active day, min/mean/std of actions per
        video and per author, and min/max/mean/std of the number of distinct
        videos touched per author.
    """
    def _counts(values):
        # Occurrence count of every distinct value.
        return list(Counter(values).values())

    feature = {}
    feature['unique_video'] = len(np.unique(group['video_id']))
    feature['unique_author'] = len(np.unique(group['author_id']))

    # fewest actions on any active day, and spread of the per-day counts
    daily_counts = _counts(group['act_day'])
    feature['min_act_for_oneday'] = min(daily_counts)
    feature['std_act_daily'] = np.std(daily_counts)

    count_video = _counts(group['video_id'])
    feature['min_video_num'] = np.min(count_video)
    feature['mean_video_num'] = np.mean(count_video)
    feature['std_video_num'] = np.std(count_video)

    count_author = _counts(group['author_id'])
    feature['min_author_num'] = np.min(count_author)
    feature['mean_author_num'] = np.mean(count_author)
    feature['std_author_num'] = np.std(count_author)

    # number of distinct videos the user acted on, per author
    distinct_pairs = group.drop_duplicates(subset=['author_id', 'video_id'])
    diff_video = _counts(distinct_pairs['author_id'])
    feature['min_video_per_author'] = np.min(diff_video)
    feature['max_video_per_author'] = np.max(diff_video)
    feature['mean_video_per_author'] = np.mean(diff_video)
    # The double underscore is kept on purpose: it is the existing column name.
    feature['std_video_per__author'] = np.std(diff_video)
    # Explicit dtype: a bare pd.Series() has a version-dependent default dtype.
    return pd.Series(feature, dtype='float64')
def add_feature(train_register, train_create, train_act):
    """Build the table of extra create / activity features, one row per registered user.

    Users with no rows in a table get NaN for that table's features
    (left merge on ``user_id``).
    """
    def _per_user(events, extractor):
        # One feature row per user, with user_id turned back into a column.
        return events.groupby('user_id', sort=True).apply(extractor).reset_index()

    train = add_register_feature(train_register)
    print('add register表特征提取完毕')

    train = train.merge(_per_user(train_create, add_create_feature),
                        on='user_id', how='left')
    print('add create表特征提取完毕')

    train = train.merge(_per_user(train_act, add_activity_feature),
                        on='user_id', how='left')
    print('add activity表特征提取完毕')
    return train

In [None]:
%%time
train_reg1, train_cre1, train_lau1, train_act1 = cut_data(register, create, launch, activity,1, 16, 1, 16, 17, 23)
add_train1 = add_feature(train_reg1, train_cre1, train_act1)
add_train1.to_hdf('../tmp_data/ADD.f5', key='add_train1', complevel=9)
del train_reg1, train_cre1, train_lau1, train_act1 , add_train1
gc.collect()

train_reg2, train_cre2, train_lau2, train_act2 = cut_data(register, create, launch, activity,1, 23, 8, 23, 24, 30)
add_train2 = add_feature(train_reg2, train_cre2, train_act2)
add_train2.to_hdf('../tmp_data/ADD.f5', key='add_train2', complevel=9)
del train_reg2, train_cre2, train_lau2, train_act2, add_train2
gc.collect()

test_reg, test_cre, test_lau, test_act = cut_data(register, create, launch, activity,1, 30, 15, 30)
add_test = add_feature(test_reg, test_cre, test_act)
add_test.to_hdf('../tmp_data/ADD.f5', key='add_test', complevel=9)
del test_reg, test_cre, test_lau, test_act, add_test


del register, create, launch, activity
gc.collect()

# 生成与label距离的活跃个数特征

In [None]:
def get_dis_cre_fea(group, label_day):
    """Count one user's video creations that fall close to the label day.

    Args:
        group: rows of a single user; must contain ``create_day``.  It is
            not modified.
        label_day: the day the label window starts.

    Returns:
        ``pd.Series`` (float64) with ``count_cre_within_<d>_day_from_label``
        for d in 1, 3, 5, 7, 9, 11, 13: the number of rows whose
        ``label_day - create_day`` is at most d.
    """
    # Computed on a separate signed array instead of writing a helper column
    # into `group`: mutating the frame handed out by groupby.apply is unsafe.
    cre_to_lab = label_day - np.asarray(group['create_day'], dtype=np.int64)
    feature = {}
    for window in (1, 3, 5, 7, 9, 11, 13):
        feature['count_cre_within_%d_day_from_label' % window] = np.sum(cre_to_lab <= window)
    return pd.Series(feature, dtype='float64')

def get_dis_lau_fea(group, label_day):
    """Count one user's app launches that fall close to the label day.

    Args:
        group: rows of a single user; must contain ``launch_day``.  It is
            not modified.
        label_day: the day the label window starts.

    Returns:
        ``pd.Series`` (float64) with ``count_lau_within_<d>_day_from_label``
        for d in 1, 3, 5, 7, 9, 11, 13: the number of rows whose
        ``label_day - launch_day`` is at most d.
    """
    # Computed on a separate signed array instead of writing a helper column
    # into `group`: mutating the frame handed out by groupby.apply is unsafe.
    lau_to_lab = label_day - np.asarray(group['launch_day'], dtype=np.int64)
    feature = {}
    for window in (1, 3, 5, 7, 9, 11, 13):
        feature['count_lau_within_%d_day_from_label' % window] = np.sum(lau_to_lab <= window)
    return pd.Series(feature, dtype='float64')

def get_dis_act_fea(group, label_day):
    """Count one user's activity rows that fall close to the label day.

    Args:
        group: rows of a single user; must contain ``act_day``.  It is not
            modified.
        label_day: the day the label window starts.

    Returns:
        ``pd.Series`` (float64) with ``count_act_within_<d>_day_from_label``
        for d in 1, 3, 5, 7, 9, 11, 13: the number of rows whose
        ``label_day - act_day`` is at most d.
    """
    # Computed on a separate signed array instead of writing a helper column
    # into `group`: mutating the frame handed out by groupby.apply is unsafe.
    act_to_lab = label_day - np.asarray(group['act_day'], dtype=np.int64)
    feature = {}
    for window in (1, 3, 5, 7, 9, 11, 13):
        feature['count_act_within_%d_day_from_label' % window] = np.sum(act_to_lab <= window)
    return pd.Series(feature, dtype='float64')

def dis_all_fea(train_register, train_create, train_launch, train_act, label_day):
    """Build the "activity close to the label day" counts, one row per registered user.

    Args:
        train_register: register rows; only ``user_id`` is used.
        train_create, train_launch, train_act: event rows of the feature window.
        label_day: the day the label window starts.

    Returns:
        DataFrame with ``user_id`` plus the create, launch and activity
        window counts; users without rows in a table get NaN for its counts.
    """
    extractors = ((train_create, get_dis_cre_fea),
                  (train_launch, get_dis_lau_fea),
                  (train_act, get_dis_act_fea))
    merged = train_register[['user_id']]
    for events, extractor in extractors:
        # label_day is passed as a plain positional argument; wrapping it in a
        # tuple would hand the tuple itself to the extractor.
        counts = events.groupby('user_id', sort=True).apply(extractor, label_day).reset_index()
        merged = merged.merge(counts, on='user_id', how='left')
    return merged

In [None]:
%%time 
#--------------------------------------------cut1---------------------------------------------------------train_reg1, train_cre1, train_lau1, train_act1 = cut_data(register, create, launch, activity,1, 16, 1, 16, 17, 23)
train_reg1, train_cre1, train_lau1, train_act1 = cut_data(register, create, launch, activity,1, 16, 1, 16, 17, 23)
dis_train1 = dis_all_fea(train_reg1, train_cre1, train_lau1, train_act1, 17)
dis_train1.to_hdf('../tmp_data/cut1.h5', key='dis_train1', complevel=9, complib='zlib')
del train_reg1, train_cre1, train_lau1, train_act1, dis_train1
gc.collect()

train_reg2, train_cre2, train_lau2, train_act2 = cut_data(register, create, launch, activity,1, 23, 8, 23, 24, 30)
dis_train2 = dis_all_fea(train_reg2, train_cre2, train_lau2, train_act2, 24)
dis_train2.to_hdf('../tmp_data/cut1.h5', key='dis_train2', complevel=9, complib='zlib')
del train_reg2, train_cre2, train_lau2, train_act2, dis_train2
gc.collect()

test_reg, test_cre, test_lau, test_act = cut_data(register, create, launch, activity,1, 30, 15, 30)
dis_test = dis_all_fea(test_reg, test_cre, test_lau, test_act, 31)
dis_test.to_hdf('../tmp_data/cut1.h5', key='dis_test', complevel=9, complib='zlib')

del test_reg, test_cre, test_lau, test_act, dis_test
del register, create, launch, activity
gc.collect()

# add hcc

In [None]:
# Helpers for the HCC (target-encoding) interaction features.
def binner(key, alldata,  maxbins = 101, na = -100, percent_per_bin = 1):
    """Bin a numeric column into percentile buckets labelled ``<key>_<bin index>``.

    Args:
        key: name of the column to bin.
        alldata: DataFrame holding the column; it is not modified.
        maxbins: columns with fewer distinct non-missing values than this are
            returned unchanged.
        na: placeholder that replaces missing values before binning.
        percent_per_bin: percentile step between bin edges.

    Returns:
        The original column when it has fewer than ``maxbins`` distinct
        values; otherwise a list of string labels, one per row.  If the column
        cannot be binned (e.g. it is not numeric) the NA-filled column is
        returned instead.
    """
    raw_column = alldata[key].fillna(na)
    observed = raw_column[raw_column != na]
    if observed.nunique() < maxbins:
        # Return the bare column (not a tuple) so the result can be assigned
        # to a DataFrame column like every other return value.
        return alldata[key]
    try:
        bins = np.unique(np.percentile(observed, np.arange(0, 100, percent_per_bin)))
        # Extra lowest edge so that the NA placeholder gets its own bin (index 0).
        if np.min(raw_column) == na:
            bins = np.insert(bins, 0, na + 1)
        bin_ids = np.digitize(raw_column, bins)
    except (TypeError, ValueError):
        # Best-effort fallback for columns that cannot be binned numerically.
        return raw_column
    return [key + "_" + str(bin_id) for bin_id in bin_ids]
def add_cate(alldata, num2cate=None, tobin=None):
    """Add categorical interaction columns to ``alldata`` (in place).

    Args:
        alldata: DataFrame containing at least ``device_type`` and
            ``register_type``; it is modified in place and also returned.
        num2cate: numeric columns to turn into string categories of the form
            ``<col>_<value>`` (overwriting the column).  Defaults to none.
        tobin: numeric columns to bin with ``binner``; each result is stored
            as ``binned_<col>``.  Defaults to none.

    Returns:
        ``(alldata, all_cate)`` where ``all_cate`` lists the base categorical
        columns followed by the names of all two-way and three-way
        interaction columns that were added.
    """
    # Imported locally because the notebook's import cell does not provide it.
    from itertools import combinations

    num2cate = list(num2cate) if num2cate else []
    tobin = list(tobin) if tobin else []

    binned_cols = []
    if tobin:
        for col in tobin:
            alldata['binned_' + col] = binner(col, alldata)
        binned_cols = [c for c in alldata.columns if 'binned' in c]
    for col in num2cate:
        alldata[col] = alldata[col].map(lambda x: col + '_' + str(x))

    base_cate = ['device_type', 'register_type'] + num2cate + binned_cols

    # Two-way, then three-way interactions: column values joined with "_".
    interaction_names = []
    for order in (2, 3):
        for combo in combinations(base_cate, order):
            name = "_".join(combo)
            interaction_names.append(name)
            joined = alldata[combo[0]].astype(str)
            for col in combo[1:]:
                joined = joined + "_" + alldata[col].astype(str)
            alldata[name] = joined

    return alldata, base_cate + interaction_names
# ----------------------- HCC (target-encoding) features -----------------------
def hcc_encode(train_df, test_df, variable, target, prior_prob, k, f=1, g=1, r_k=None, update_df=None):
    """Target-encode a categorical column with prior smoothing.

    See "A Preprocessing Scheme for High-Cardinality Categorical Attributes in
    Classification and Prediction Problems" by Daniele Micci-Barreca.

    For every category the encoding is
    ``lambda * mean(target) + (1 - lambda) * prior_prob`` with
    ``lambda = 1 / (g + exp((k - size) / f))``, estimated on ``train_df`` and
    looked up for the rows of ``test_df``; unseen categories get ``prior_prob``.

    Args:
        train_df: frame the category statistics are estimated on.
        test_df: frame whose rows are encoded.
        variable: name of the categorical column.
        target: name of the target column in ``train_df``.
        prior_prob: fallback value and shrinkage target.
        k, f, g: parameters of the ``lambda`` formula above.
        r_k: if truthy, each value is multiplied by uniform noise drawn from
            ``[1 - r_k, 1 + r_k]`` (not part of the original paper).
        update_df: frame that receives the column ``hcc_<variable>_<target>``
            (rows matched by index); defaults to ``test_df``.

    Returns:
        None.  The result is written into ``update_df`` in place.
    """
    hcc_name = "_".join(["hcc", variable, target])
    # List form of agg: the dict-renaming form is rejected by current pandas
    # for a single grouped column.
    grouped = train_df.groupby(variable)[target].agg(["size", "mean"])
    grouped["lambda"] = 1 / (g + np.exp((k - grouped["size"]) / f))
    grouped[hcc_name] = grouped["lambda"] * grouped["mean"] + (1 - grouped["lambda"]) * prior_prob
    encoded = test_df[[variable]].join(grouped, on=variable, how="left")[hcc_name].fillna(prior_prob)
    # The global NumPy RNG is re-seeded on every call (even without noise), so
    # the noise pattern is identical from call to call.
    np.random.seed(1011)
    if r_k:
        encoded *= np.random.uniform(1 - r_k, 1 + r_k, len(test_df))
    if update_df is None:
        update_df = test_df
    if hcc_name not in update_df.columns:
        update_df[hcc_name] = np.nan
    update_df.update(encoded)
    return
def add_hcc(train, test, cate_cols):
    """Add HCC target encodings for every categorical column, then drop object columns.

    The test encodings are estimated on the whole of ``train``.  The train
    encodings are estimated out-of-fold with a 10-fold stratified split (plus
    1% multiplicative noise) so that a row's own label does not feed its
    encoding.

    Args:
        train: labelled frame with a ``label`` column; modified in place.
        test: frame to encode; modified in place.
        cate_cols: names of the categorical columns to encode.

    Returns:
        ``(train, test)`` with the ``hcc_*`` columns added and all
        object-dtype columns of ``train`` dropped from both frames.
    """
    # Imported locally because the notebook's import cell has this import
    # commented out; scikit-learn must be installed.
    from sklearn.model_selection import StratifiedKFold

    skf = StratifiedKFold(10)
    prior = train['label'].mean()
    # dict.fromkeys de-duplicates while keeping a deterministic column order.
    for variable in dict.fromkeys(cate_cols):
        hcc_encode(train, test, variable, 'label', prior, k=5, r_k=None)
        for fit_idx, encode_idx in skf.split(np.zeros(len(train)), train['label']):
            hcc_encode(train.iloc[fit_idx], train.iloc[encode_idx], variable, 'label',
                       prior, k=5, r_k=0.01, update_df=train)
    object_cols = [col for col in train.columns if train[col].dtype == "object"]
    # Dropped in place so callers holding references to the frames see the result too.
    train.drop(object_cols, axis=1, inplace=True)
    test.drop(object_cols, axis=1, inplace=True)
    return train, test

In [None]:
%%time
# Load the base features of both training splits and of the test split.
# The two training splits are stacked row-wise into a single training frame.
# The commented-out lines are the (currently disabled) merge of the
# "activity close to the label day" counts.
train1 = pd.read_hdf('../tmp_data/cut_train1.h5', key='train1')
#dis_train1 = pd.read_hdf('dis.h5', key='dis_train1')
#train1 = train1.merge(dis_train1, how='left', on='user_id')
train2 = pd.read_hdf('../tmp_data/cut_train2.h5', key='train2')
#dis_train2 = pd.read_hdf('dis.h5', key='dis_train2')
#train2 = train2.merge(dis_train2, how='left', on='user_id')
train = pd.concat([train1, train2], ignore_index=True, axis=0)
test = pd.read_hdf('../tmp_data/cut_test.h5', key='test')
#dis_test = pd.read_hdf('dis.h5', key='dis_test')
#test = test.merge(dis_test, how='left', on='user_id')
del train1, train2
gc.collect()


In [None]:
%%time
# Index both frames by user_id and stack them so the categorical interaction
# columns are built identically for train and test.
# NOTE(review): the two training splits are cut from overlapping registration
# windows (days 1-16 and days 1-23), so user_id is presumably not unique in
# `train` -- confirm that the index-based update in hcc_encode copes with that.
train = train.set_index('user_id')
test = test.set_index('user_id')
train["train_flag"] = 1
test["train_flag"] = 0
# Placeholder label for the test rows; it is dropped again below.
test["label"] = -1
alldata = pd.concat([train, test], axis = 0)

#num2cate =  ['min_of_launch_days_to_label', 'max_launch_day_cut_regis']
#tobin = ['count_act_within_1_day_from_label', '1_action_count_div_sum', '3_page_count_div_sum',
#        'mean_of_launch_days_to_label', '1_page_count_div_sum']
# With empty num2cate/tobin only device_type, register_type and their
# interaction are used as categorical columns.
alldata, cate_cols= add_cate(alldata, \
 num2cate=[], tobin=[])
print(cate_cols)
print('done')
# Split back into train / test and add the HCC encodings.
train_processed = alldata[alldata["train_flag"] == 1].copy()
test_processed = alldata[alldata["train_flag"] == 0].copy()
train_processed, test_processed = add_hcc(train_processed, test_processed, cate_cols)
X_train = train_processed.drop(['label', 'train_flag'], axis=1)
Y_train = train_processed['label']
X_test = test_processed.drop(['train_flag', 'label'], axis=1)    
X_test.shape

In [None]:
# Store only the user id and the three HCC encodings, train and test under
# separate keys of the same HDF file.
hcc_columns = ['user_id', 'hcc_device_type_label', 'hcc_register_type_label',
               'hcc_device_type_register_type_label']
for hcc_frame, hdf_key in ((X_train, 'hcc_train'), (X_test, 'hcc_test')):
    hcc_frame.reset_index()[hcc_columns].to_hdf('hcc.h5', key=hdf_key, complevel=9, complib='zlib')

# add2

In [None]:
def add_register_feature(train_register):
    """Return a one-column frame holding only ``user_id``.

    No register-based features (register type, device type, ...) are added
    here; the result is the key table the other features are merged onto.
    """
    key_columns = ['user_id']
    return train_register[key_columns]

def add_activity_feature2(group):
    """Per-day repeat statistics of one user's actions on videos and authors.

    Args:
        group: rows of a single user; must contain ``act_day``, ``video_id``
            and ``author_id``.

    Returns:
        ``pd.Series`` (float64).  For videos (``vd``) and authors (``au``):
        * ``max_act4<x>_alldaymax`` / ``mean_act4<x>_alldaymax``: max / mean
          over days of the daily maximum number of actions on one item;
        * ``max_act4<x>_alldaymean`` / ``mean_act4<x>_alldaymean``: max / mean
          over days of the daily mean number of actions per item;
        * ``max_count4_per_<x>`` / ``mean_count4_per_<x>``: max / mean number
          of distinct days on which an item was acted on.
    """
    feature = {}
    for column, tag in (('video_id', 'vd'), ('author_id', 'au')):
        # Number of actions for every (day, item) pair.
        pair_counts = Counter(zip(group['act_day'], group[column]))
        counts_by_day = {}          # day -> action counts of the items touched that day
        days_per_item = Counter()   # item -> number of distinct days it was touched
        for (day, item), n_actions in pair_counts.items():
            counts_by_day.setdefault(day, []).append(n_actions)
            # Counting (day, item) pairs compares item *values*; a membership
            # test against a pandas Series would look at its index labels.
            days_per_item[item] += 1

        day_max = [max(counts) for counts in counts_by_day.values()]
        day_mean = [np.mean(counts) for counts in counts_by_day.values()]
        feature['max_act4%s_alldaymax' % tag] = np.max(day_max)
        feature['mean_act4%s_alldaymax' % tag] = np.mean(day_max)
        feature['max_act4%s_alldaymean' % tag] = np.max(day_mean)
        feature['mean_act4%s_alldaymean' % tag] = np.mean(day_mean)

        item_days = list(days_per_item.values())
        feature['max_count4_per_%s' % tag] = max(item_days)
        feature['mean_count4_per_%s' % tag] = np.mean(item_days)
    # Explicit dtype: a bare pd.Series() has a version-dependent default dtype.
    return pd.Series(feature, dtype='float64')
def add_feature2(train_register, train_create, train_act):
    """Build the "add2" feature table for one data window.

    Register-side features are produced by ``add_register_feature``; activity-side
    features are produced by applying ``add_activity_feature2`` to every
    ``user_id`` group of ``train_act``. The activity features are then left-joined
    onto the register features by ``user_id``.

    Parameters
    ----------
    train_register
        Passed unchanged to ``add_register_feature``.
    train_create
        Not used by this function; kept so the signature stays unchanged.
    train_act
        Activity table; it is grouped on its ``user_id`` column.

    Returns
    -------
    The result of ``pd.merge(..., on='user_id', how='left')`` of the register
    features with the per-user activity features.
    """
    register_features = add_register_feature(train_register)
    print('add2 register表特征提取完毕')

    # One call of add_activity_feature2 per user; reset_index turns the group key
    # back into an ordinary user_id column so it can serve as the merge key.
    per_user = train_act.groupby('user_id', sort=True)
    activity_features = per_user.apply(add_activity_feature2).reset_index()

    merged = pd.merge(register_features, activity_features, on='user_id', how='left')
    print('add2 activity表特征提取完毕')
    return merged

In [None]:
%%time
# Build the "add2" feature table for each data window and store it under its own
# key in ../tmp_data/ADD.f5.
#
# Each entry is (HDF key, positional arguments forwarded to cut_data after the four
# log tables). The test entry passes two fewer arguments than the training entries.
# NOTE(review): the numbers are presumably day boundaries - confirm against cut_data.
add2_windows = [
    ('add2_train1', (1, 16, 1, 16, 17, 23)),
    ('add2_train2', (1, 23, 8, 23, 24, 30)),
    ('add2_test', (1, 30, 15, 30)),
]

for hdf_key, window_args in add2_windows:
    part_reg, part_cre, part_lau, part_act = cut_data(
        register, create, launch, activity, *window_args)
    add2_part = add_feature2(part_reg, part_cre, part_act)
    add2_part.to_hdf('../tmp_data/ADD.f5', key=hdf_key, complevel=9)
    # Drop this window's frames before slicing the next one. The original cell
    # had a stray ", ," in one of these del statements (a SyntaxError that kept
    # the whole cell from running) and skipped the collect after the test window.
    del part_reg, part_cre, part_lau, part_act, add2_part
    gc.collect()

# The raw log tables are not needed by this cell any more once every window is written.
del register, create, launch, activity
gc.collect()

# Put my features together

In [None]:
ls -lh  # long listing of the working directory, human-readable sizes

In [None]:
# Peek at which tables exist in the add-feature HDF5 file.
# The other cells write and read this file as '../tmp_data/ADD.f5'; the bare
# 'ADD.f5' used here before pointed at the working directory instead.
# NOTE(review): confirm no copy of ADD.f5 is meant to live in the working directory.
# Read-only mode makes a wrong path raise instead of silently creating an empty
# store (the default mode is append), and the context manager closes the handle
# so it is not left open while later cells read the same file.
with pd.HDFStore('../tmp_data/ADD.f5', mode='r') as ad:
    add_keys = ad.keys()
add_keys

In [None]:
%%time
import pandas as pd  # repeated import kept from the original cell


def _assemble_split(base_file, base_key, extra_tables):
    """Read one base table and left-join further tables onto it by user_id.

    base_file / base_key locate the base table. extra_tables is an ordered list
    of (file, key) pairs; each is read with pd.read_hdf and merged in turn.
    """
    merged = pd.read_hdf(base_file, key=base_key)
    for extra_file, extra_key in extra_tables:
        extra = pd.read_hdf(extra_file, key=extra_key)
        merged = merged.merge(extra, how='left', on='user_id')
    return merged


def _na_fill_value(column):
    """Choose the NaN replacement for a column from substrings of its name."""
    # The constants are carried over unchanged; their meaning is not visible here.
    if 'gap4' in column or '2regis' in column:
        return -1
    if '2label' in column:
        return 17
    return 0


# ---- training rows: two windows, each joined with its dis / add / add2 tables ----
train1 = _assemble_split('../tmp_data/cut_train1.h5', 'train1', [
    ('../tmp_data/dis.h5', 'dis_train1'),
    ('../tmp_data/ADD.f5', 'add_train1'),
    ('../tmp_data/ADD.f5', 'add2_train1'),
])
train2 = _assemble_split('../tmp_data/cut_train2.h5', 'train2', [
    ('../tmp_data/dis.h5', 'dis_train2'),
    ('../tmp_data/ADD.f5', 'add_train2'),
    ('../tmp_data/ADD.f5', 'add2_train2'),
])
train = pd.concat([train1, train2], axis=0).set_index('user_id')
del train1, train2

# The hcc table is attached column-wise, so rows are matched through the user_id index.
# NOTE(review): presumably hcc_train carries exactly the same user_id index as the
# stacked training frame - confirm, since the two windows may share users.
hcc_train = pd.read_hdf('../tmp_data/hcc.h5', key='hcc_train').set_index('user_id')
train = pd.concat([train, hcc_train], axis=1).reset_index()
del hcc_train

# ---- test rows: same recipe with the test tables ----
test = _assemble_split('../tmp_data/cut_test.h5', 'test', [
    ('../tmp_data/dis.h5', 'dis_test'),
    ('../tmp_data/ADD.f5', 'add_test'),
    ('../tmp_data/ADD.f5', 'add2_test'),
]).set_index('user_id')
hcc_test = pd.read_hdf('../tmp_data/hcc.h5', key='hcc_test').set_index('user_id')
test = pd.concat([test, hcc_test], axis=1).reset_index()
del hcc_test

# Disabled experiment (was an inert string literal in the original cell):
# lgbtrain_pred_fea = pd.read_hdf('../input/pred_fea.h5', key='lgbtrain_pred_fea')
# lgbtest_pred_fea = pd.read_hdf('../input/pred_fea.h5', key='lgbtest_pred_fea')
# train = pd.concat([train, lgbtrain_pred_fea], axis=1)
# test = pd.concat([test, lgbtest_pred_fea], axis=1)

gc.collect()

# Fill missing values column by column. Only columns present in test are visited,
# and each one is filled with the same value in both frames.
for column in list(test):
    fill_value = _na_fill_value(column)
    train[column] = train[column].fillna(fill_value)
    test[column] = test[column].fillna(fill_value)

print(train.shape)
print(test.shape)

In [None]:
# Memory optimisation for the training frame: offer every int and float column to
# pd.to_numeric with downcast='unsigned' and write the results back in place.
# NOTE(review): columns pandas cannot express as unsigned integers presumably come
# back unchanged, so fractional float columns likely stay as they are - confirm
# whether downcast='float' was intended for those.
int_part = train.select_dtypes(include=['int']).apply(pd.to_numeric, downcast='unsigned')

# Float columns are rounded to 5 decimals before the downcast attempt.
float_part = (
    train.select_dtypes(include=['float'])
    .apply(lambda col: np.around(col, 5))
    .apply(pd.to_numeric, downcast='unsigned')
)

for part in (int_part, float_part):
    train[part.columns] = part

del int_part, float_part, part
gc.collect()

In [None]:
# Memory optimisation for the test frame: offer every int and float column to
# pd.to_numeric with downcast='unsigned' and write the results back in place.
# NOTE(review): columns pandas cannot express as unsigned integers presumably come
# back unchanged, so fractional float columns likely stay as they are - confirm
# whether downcast='float' was intended for those.
int_part = test.select_dtypes(include=['int']).apply(pd.to_numeric, downcast='unsigned')

# Float columns are rounded to 5 decimals before the downcast attempt.
float_part = (
    test.select_dtypes(include=['float'])
    .apply(lambda col: np.around(col, 5))
    .apply(pd.to_numeric, downcast='unsigned')
)

for part in (int_part, float_part):
    test[part.columns] = part

del int_part, float_part, part
gc.collect()

In [None]:
# Persist the assembled train and test frames, each to its own compressed HDF5 file.
final_outputs = (
    (train, '../tmp_data/train.h5', 'train'),
    (test, '../tmp_data/test.h5', 'test'),
)
for frame, h5_path, h5_key in final_outputs:
    frame.to_hdf(h5_path, key=h5_key, complevel=9)