# 加上两个用户行为特征

*添加用户行为特征，对于item和分类的访问次数*

*超参搜索，找出不容易过拟合的参数*


In [1]:
import sys
sys.path.append("..")
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

%matplotlib inline

# Load the space-separated round-1 training and test-A files.
train_df = pd.read_csv('../../round1_ijcai_18_train_20180301.txt', sep=' ')
test_df = pd.read_csv('../../round1_ijcai_18_test_a_20180301.txt', sep=' ')

# Offline (labelled) and online (unlabelled) rows go through the same feature
# processing; the online rows get the placeholder label -1 so both frames
# share one set of columns.
test_df['is_trade'] = -1
total_df = pd.concat([train_df, test_df], ignore_index=True)

# 1 先处理时序数据

In [2]:
# Time handling: split the timestamp into day, weekday, part of day
# (morning / noon / afternoon / evening) and hour.
# NOTE(review): this cell only derives date, day, weekday and hour; no
# part-of-day column is built here.
# `date` is not used as a final feature; the encoded `day` (days 0 - 7) is used instead.

from sklearn import preprocessing
from datetime import datetime
# Label encoder used below to turn the `date` strings into the integer `day` column.
le = preprocessing.LabelEncoder()

def extract_date(x):
    """Format the Unix timestamp ``x`` as a ``YYYY-MM-DD`` string.

    ``datetime.fromtimestamp`` is called without a ``tz`` argument, so the
    date is expressed in the local time zone of the machine running the code.
    """
    return datetime.fromtimestamp(x).strftime('%Y-%m-%d')
def extract_weekday(x):
    """Return the day of the week for the Unix timestamp ``x``.

    The value follows ``datetime.weekday()``: Monday is 0 and Sunday is 6,
    evaluated in the machine's local time zone.
    """
    return datetime.fromtimestamp(x).weekday()
def extract_hour(x):
    """Return the hour of day (0-23) for the Unix timestamp ``x``.

    The hour is read from the local-time ``datetime`` built from ``x``.
    """
    return datetime.fromtimestamp(x).hour

# Derive the calendar features from the `context_timestamp` column. The helper
# functions are handed to `apply` directly; a wrapping lambda adds nothing.
total_df['date'] = total_df['context_timestamp'].apply(extract_date)
# Integer-encode the date strings into `day`.
# NOTE(review): this relies on LabelEncoder numbering labels in sorted order,
# so that YYYY-MM-DD strings map to chronological day numbers — confirm.
total_df['day'] = le.fit_transform(total_df['date'])
total_df['weekday'] = total_df['context_timestamp'].apply(extract_weekday)
total_df['hour'] = total_df['context_timestamp'].apply(extract_hour)

# 2 先分离训练集, 测试集, 线上集



In [11]:
# Collect the row indices for training and testing.
# test_day = 6: validate on the full labelled data (days 0-5 -> day 6);
# test_day = 7: score day 7 to generate the online submission file.
import numpy as np
test_day = 6
starts = list(range(0, test_day))
ends = [test_day]
train_indices = total_df[total_df['day'].isin(starts)].index.values
test_indices = total_df[total_df['day'].isin(ends)].index.values
print(train_indices.shape, test_indices.shape)

tmp_df = total_df.copy()

# Save the labels of the test rows before they are blanked out below.
# The index arrays hold index *labels*, so they are used with the label-based
# `.loc` everywhere rather than mixing `.iloc` and `.loc`.
y_test = tmp_df.loc[test_indices, 'is_trade'].copy()

# Force-remove the label of the test rows so no later feature step can see it.
tmp_df.loc[test_indices, 'is_trade'] = np.nan

# Assemble the rows needed for training and testing and tag each part in a
# `data_set` column.
train_tmp = tmp_df.loc[train_indices].copy()
train_tmp['data_set'] = 'training'
test_tmp = tmp_df.loc[test_indices].copy()
test_tmp['data_set'] = 'testing'
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row
# order and the original index labels.
raw_df = pd.concat([train_tmp, test_tmp])


(420717,) (57421,)


### 重复列特征

In [12]:
# Load the helper script. generateColDupByDay is not defined in this notebook;
# presumably it comes from time_utils.py — TODO confirm.
%run ../util/time_utils.py
# Columns handed to the helper (this section is titled "duplicate column features").
dup_feat = ['item_id', 'item_brand_id', 'shop_id', 'user_id']
# NOTE(review): list(range(1, 8)) passes the values 1..7; what they mean
# (day offsets? window sizes?) is decided by the helper — confirm.
raw_df = generateColDupByDay(raw_df, dup_feat, list(range(1, 8)), verbose=False)

### 交易率特征选择

*固定smooth为10*

1. 特征是否独立提升?
2. 平滑是否不会改变正负向?

##### 正向特征

> item_city_id, shop_id, user_gender_id, item_sales_level, item_collected_level, shop_review_num_level

##### 负向特征

> item_price_level,
item_id,item_brand_id,item_pv_level,user_age_level,
user_occupation_id,user_star_level,context_page_id,
shop_review_positive_rate,shop_star_level,
shop_score_service,shop_score_description,day,hour


In [13]:
# Compute the trade rate of the previous day and assign it to the next day;
# day 0 falls back to its own values.
# %run ../util/time_utils.py

# setTradeRateByDate(raw_df, ['item_city_id', 'item_id', 'item_brand_id', 'shop_id', 'user_id'])

# Same idea as the disabled call above, now through the helpers in trade_info.py.
%run ../util/trade_info.py

# trade_rela = ['item_city_id', 'shop_id', 'user_gender_id', 'item_sales_level', 'item_collected_level', 'shop_review_num_level']
# Columns passed to the trade-rate helper.
trade_rela = ['item_city_id', 'item_id', 'item_brand_id', 'shop_id', 'user_id']

# colSm = {}
# for col in trade_rela:
#     colSm[col] = [15*(mean0) , 15]
# NOTE(review): the notes above this cell say smoothing is fixed at 10, but
# glbSmoothing=200 is passed here — confirm which value is intended.
# The return value is not used; presumably the helper adds its columns to
# raw_df in place (the shape is printed right after) — confirm.
generateTradeRateByDate(raw_df, trade_rela, 7, None, verbose=False, glbSmoothing=200, glbMean0=0.05)
print(raw_df.shape)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


(478138, 71)


### 复合类型拆解

In [14]:
%run ../util/complex_type.py
from sklearn import preprocessing
from datetime import datetime
le = preprocessing.LabelEncoder()
raw_df = process_complex_types(raw_df, get_icl_map(raw_df), get_ipl_map(raw_df))

raw_df['item_category_1'] = le.fit_transform(raw_df.item_category_1)
raw_df['item_category_2'] = le.fit_transform(raw_df.item_category_2)

get_icl_map ... 
get_ipl_map ... 
processing predict_category_property ...
processing item_property_list ...
processing item_category_list ...
generating item_category_1, item_category_2 ...


In [15]:
# Reload the helper script. set_review_cnt is not defined in this notebook;
# presumably it comes from complex_type.py — TODO confirm.
%run ../util/complex_type.py

# Number of rows (instance_id count) per (user_id, item_id) pair, as a dict
# keyed by that pair.
# NOTE(review): the counts are taken over all of raw_df, i.e. training and
# testing rows together — confirm this is intended.
cnt_user_item_review = raw_df[["user_id", "item_id", "instance_id"]].groupby(["user_id", "item_id"])['instance_id'].count().to_dict() 
# Same count per (user_id, item_category_1) pair.
cnt_user_cate_review = raw_df[["user_id", "item_category_1", "instance_id"]].groupby(["user_id", "item_category_1"])['instance_id'].count().to_dict()  

# Row-level callables built by the helper from the two count dicts; what value
# they return per row is decided by set_review_cnt — confirm.
f1 = set_review_cnt("user_id", "item_id", cnt_user_item_review)
f2 = set_review_cnt("user_id", "item_category_1", cnt_user_cate_review)

# Apply the callables row by row in timestamp order, then restore the original
# row order by sorting on the index again.
tmp = raw_df.sort_values(by="context_timestamp")
tmp["item_review_cnt"] = tmp[["user_id", "item_id"]].apply(f1, axis=1)
tmp["cate_review_cnt"] = tmp[["user_id", "item_category_1"]].apply(f2, axis=1)
raw_df = tmp.sort_index()


# 3 生成矩阵数据

In [16]:
# Split the engineered frame back into its training and testing parts using
# the `data_set` tag.
train_df = raw_df[raw_df['data_set'] == 'training']
test_df = raw_df[raw_df['data_set'] == 'testing']

# Columns that are dropped before the data reaches the model (bookkeeping
# columns, the label, and the raw / intermediate columns). Each name is
# listed once; the earlier list repeated three of them.
non_feat_columns = [
    'data_set', 'context_timestamp', 'instance_id', 'is_trade', 'context_id',
    'item_property_list', 'item_category_list', 'date',
    'predict_category_property', 'predict_richness', 'item_property_richness',
    'item_category_1', 'item_category_2',
]

D = train_df[['date']]
X_train = train_df.drop(non_feat_columns, axis=1)
# 1-D label array taken straight from the column (no 2-D detour + ravel).
y_train = train_df['is_trade'].values
X_test = test_df.drop(non_feat_columns, axis=1)
# y_test already exists: it was saved when the test indices were selected.

# X_online = test_df.drop(non_feat_columns, axis=1).values
print(D.shape, X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(420717, 1) (420717, 64) (420717,) (57421, 64) (57421,)


In [17]:
# 训练模型
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
import xgboost

#import lightgbm as lgb

%run ../util/time_series_split.py

# clf = lgb.LGBMClassifier(n_jobs=20)
clf = xgboost.XGBClassifier(n_jobs=7,max_depth=5,n_estimators=91,min_child_weight=5, max_delta_step=0.9,gamma=6.6)
# clf = xgboost.XGBClassifier(n_jobs=7, max_depth=3, n_estimators=230)

m = None

# 如果移动到线上集, 则输出模型
if sum(y_test == -1) > 0:
    m = clf.fit(X_train, y_train)
    result = pd.DataFrame()
    result['instance_id'] = test_df['instance_id']
    result['predicted_score'] = pd.DataFrame(m.predict_proba(X_test))[1].values
    result.to_csv('submits/7_7452_8095_a8170_b8063.csv', sep = ' ', header=True, index = False)
else:
    # 分离a,b榜
    X_val_a, X_val_b, y_val_a, y_val_b = train_test_split(X_test, y_test, test_size=0.7, shuffle=True, random_state=6)
    m = clf.fit(X_train, y_train)
    
    val_a_loss = log_loss(y_val_a, m.predict_proba(X_val_a))
    val_b_loss = log_loss(y_val_b, m.predict_proba(X_val_b))
    print('(%s -> %s) train logloss: %.5f, test logloss: %.5f, a: %.5f, b: %.5f' % \
          (starts, ends, \
           log_loss(y_train, m.predict_proba(X_train)), \
           log_loss(y_test, m.predict_proba(X_test)),\
          val_a_loss, val_b_loss))
    
# ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07348, test logloss: 0.08093, a: 0.08162, b: 0.08064
# ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07379, test logloss: 0.08092, a: 0.08160, b: 0.08063
# ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07424, test logloss: 0.08098, a: 0.08179, b: 0.08064
# ([0, 1, 2, 3, 4, 5] -> [6]) gamma6.6 train logloss: 0.07416, test logloss: 0.08097, a: 0.08177, b: 0.08063
# ([0, 1, 2, 3, 4, 5] -> [6]) +max_delta_step 0.9 train logloss: 0.07452, test logloss: 0.08095, a: 0.08170, b: 0.08063

([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08170, b: 0.08063


In [18]:
# The fitted model and the full train / test sets do not change inside the
# loop, so their log-loss is computed once here instead of re-predicting the
# whole training set on each of the 1000 iterations.
train_loss = log_loss(y_train, m.predict_proba(X_train))
test_loss = log_loss(y_test, m.predict_proba(X_test))

for i in range(1000):
    # Re-draw the simulated A/B leaderboard split with seed i to see how much
    # the two scores move with the split alone.
    X_val_a, X_val_b, y_val_a, y_val_b = train_test_split(X_test, y_test, test_size=0.7, shuffle=True, random_state=i)

    val_a_loss = log_loss(y_val_a, m.predict_proba(X_val_a))
    val_b_loss = log_loss(y_val_b, m.predict_proba(X_val_b))
    print('%d (%s -> %s) train logloss: %.5f, test logloss: %.5f, a: %.5f, b: %.5f' %
          (i, starts, ends, train_loss, test_loss, val_a_loss, val_b_loss))

0 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08475, b: 0.07932
1 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.07702, b: 0.08264
2 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08397, b: 0.07966
3 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08060, b: 0.08110
4 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.07776, b: 0.08232
5 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08443, b: 0.07946
6 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08170, b: 0.08063
7 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08504, b: 0.07920
8 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.07795, b: 0.08224
9 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08523, b: 0.07912


82 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08020, b: 0.08127
83 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08394, b: 0.07967
84 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08006, b: 0.08133
85 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08000, b: 0.08136
86 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.07662, b: 0.08281
87 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08495, b: 0.07924
88 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08401, b: 0.07964
89 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.07765, b: 0.08237
90 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08069, b: 0.08106
91 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.07730, b

163 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.07568, b: 0.08321
164 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08408, b: 0.07961
165 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08106, b: 0.08091
166 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.07937, b: 0.08163
167 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08242, b: 0.08032
168 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08195, b: 0.08052
169 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08108, b: 0.08090
170 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.07943, b: 0.08160
171 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08385, b: 0.07971
172 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 

244 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08145, b: 0.08074
245 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.07747, b: 0.08244
246 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08259, b: 0.08025
247 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.07923, b: 0.08169
248 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.07838, b: 0.08205
249 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08616, b: 0.07872
250 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08089, b: 0.08098
251 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08033, b: 0.08122
252 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08304, b: 0.08006
253 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 

325 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.07686, b: 0.08270
326 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08376, b: 0.07975
327 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08287, b: 0.08013
328 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.07962, b: 0.08152
329 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08137, b: 0.08077
330 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.07936, b: 0.08163
331 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08232, b: 0.08036
332 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.08200, b: 0.08050
333 ([0, 1, 2, 3, 4, 5] -> [6]) train logloss: 0.07452, test logloss: 0.08095, a: 0.07927, b: 0.08167


KeyboardInterrupt: 

# 4 超参搜索

In [23]:
from sklearn.model_selection import ParameterGrid
# 训练模型
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
import xgboost

# Hyper-parameter grid for the XGBoost search. Every list currently holds a
# single value; the trailing comments keep candidate lists tried earlier.
# The grid is only consumed by the search loop below, which is commented out.
params={
    'max_depth': [5],#[6,10,15,20], # 5 is good but takes too long in kaggle env
    'subsample': [1],#[1,0.9,0.8,0.7],#[0.1,0.2,0.3,0.4,0.5,0.55,0.6,0.65,0.7,0.8,0.9],#[0.9]
    'colsample_bytree': [1],#[0.1,0.2,0.3,0.4,0.5,0.6,0.65,0.7,0.75,0.8,0.9,0.95],#[0.9],
    'colsample_bylevel':[1],#[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
    'scale_pos_weight':[1],#[0.7,0.8,1,1.1,1.2],#0.1,0.5,1,2,5
    'max_delta_step':[0.9],#[0,0.1,0.001,0.05,0.005,0.5,1,2],
    'n_estimators': [91],#[100,90,91,92,93,94,95,96,97,98,99],#[200,230,260,270,280,290,300]
    'reg_lambda': [1],#[1.3,1.4,1.5,1.6],
    'reg_alpha': [0],#[0,0.01,0.1,0.02,0.2],#[0.01,0.05,0.005,0.2,0.1,0.02,0.0001,0],#
    'min_child_weight':[5],#[7,8,9,10,11,12,13,14,15,16,17,18,19,20],#[23],#[20,21,22,23,24,25,26],#
    'gamma':[6.6],#[6],#[0.1],#
    'learning_rate':[0.1],#[0.01,0.015,0.02,0.025,0.05,0.005,0.1],#[0.02]#
}

# if sum(y_test == -1) == 0:
#     xgb = xgboost.XGBClassifier(n_jobs=7)
#     best_score = 1 
#     for g in ParameterGrid(params):
#         xgb.set_params(**g)

#         # 分离a,b榜
#         X_val_a, X_val_b, y_val_a, y_val_b = train_test_split(X_test, y_test, test_size=0.7, shuffle=True, random_state=6)
#         m = xgb.fit(X_train, y_train, eval_metric='logloss')

#         val_train_loss = log_loss(y_train, m.predict_proba(X_train))
#         val_test_loss = log_loss(y_test, m.predict_proba(X_test))
#         val_a_loss = log_loss(y_val_a, m.predict_proba(X_val_a))
#         val_b_loss = log_loss(y_val_b, m.predict_proba(X_val_b))
#         print('-'*80)
#         print(g)
#         print('(%s -> %s) train logloss: %.5f, test logloss: %.5f, a: %.5f, b: %.5f' % \
#               (starts, ends, val_train_loss, val_test_loss, val_a_loss, val_b_loss))

#         # save if best
#         if val_test_loss < best_score:
#             best_score = val_test_loss
#             best_grid = g

#     print('-'*80, '\n')        
#     print ("log loss: %0.5f" % best_score )
#     print ("Grid:", best_grid)

In [24]:
from datetime import datetime
# Print the current local date and time.
print(datetime.now())

2018-04-03 08:28:22.683362
