### 项目目的
目标：根据用户的历史行为记录，预测用户在接下来一天对商品子集的购买行为

评估指标：Recall和F1

### 数据加载

In [3]:
import pandas as pd
# Load the two input tables: the user-behaviour log and the item table.
train_user = pd.read_csv(
    '../../dataset/mobile_recommend/tianchi_mobile_recommend_train_user.csv'
)
train_item = pd.read_csv(
    '../../dataset/mobile_recommend/tianchi_mobile_recommend_train_item.csv'
)

### 数据查看

In [4]:
# Summary statistics for the numeric columns of the user-behaviour table.
train_user.describe()

Unnamed: 0,user_id,item_id,behavior_type,item_category
count,12256910.0,12256910.0,12256910.0,12256910.0
mean,71707320.0,202308400.0,1.105271,6846.162
std,41229200.0,116739700.0,0.4572662,3809.922
min,4913.0,64.0,1.0,2.0
25%,35849650.0,101413000.0,1.0,3721.0
50%,72928040.0,202135900.0,1.0,6209.0
75%,107377400.0,303540500.0,1.0,10290.0
max,142455900.0,404562500.0,4.0,14080.0


In [13]:
# Non-null count per column; a lower count reveals missing values.
train_user.count()

user_id          12256906
item_id          12256906
behavior_type    12256906
user_geohash      3922082
item_category    12256906
time             12256906
dtype: int64

In [14]:
# Column names of the user-behaviour table.
train_user.columns

Index(['user_id', 'item_id', 'behavior_type', 'user_geohash', 'item_category',
       'time'],
      dtype='object')

In [15]:
# Preview the first 10 rows of the user-behaviour table.
train_user.head(10)

Unnamed: 0,user_id,item_id,behavior_type,user_geohash,item_category,time
0,98047837,232431562,1,,4245,2014-12-06 02
1,97726136,383583590,1,,5894,2014-12-09 20
2,98607707,64749712,1,,2883,2014-12-18 11
3,98662432,320593836,1,96nn52n,6562,2014-12-06 10
4,98145908,290208520,1,,13926,2014-12-16 21
5,93784494,337869048,1,,3979,2014-12-03 20
6,94832743,105749725,1,,9559,2014-12-13 20
7,95290487,76866650,1,,10875,2014-11-27 16
8,96610296,161166643,1,,3064,2014-12-11 23
9,100684618,21751142,3,,2158,2014-12-05 23


In [17]:
# Summarise user_geohash: non-null count, distinct values, most frequent value.
train_user.user_geohash.describe()

count     3922082
unique     575458
top       94ek6ke
freq         1052
Name: user_geohash, dtype: object

### 从用户训练集表中，可以得到以下信息：
1.共有6个字段，分别为'user_id', 'item_id', 'behavior_type', 'user_geohash', 'item_category', 'time'，依次对应用户id、商品id、行为类型、用户地理位置、商品类别、时间

2.用户行为分为4类，值分别为1、2、3、4，依次对应浏览、收藏、加购物车、购买

3.地理位置（user_geohash）缺失值较多：12256906 条记录中仅有 3922082 条非空（约 32%），需要考虑如何处理这些缺失值


In [5]:
# Summary statistics for the numeric columns of the item table.
train_item.describe()

Unnamed: 0,item_id,item_category
count,480723.0,480723.0
mean,198319600.0,6790.729437
std,118097600.0,3404.598754
min,2651.0,2.0
25%,90322190.0,4533.0
50%,208834800.0,6648.0
75%,295694200.0,9933.0
max,404562400.0,14071.0


In [6]:
# Column names of the item table.
train_item.columns

Index(['item_id', 'item_geohash', 'item_category'], dtype='object')

In [7]:
# Preview the first rows of the item table.
train_item.head()

Unnamed: 0,item_id,item_geohash,item_category
0,312051294,,8270
1,99999754,,7393
2,131746128,,7876
3,385731330,,10544
4,100004415,,3064


In [49]:
# Convert the text-formatted time column to timestamps so that it can be used
# in date arithmetic. The raw values look like "2014-12-06 02" (date plus
# hour), so the format is given explicitly instead of being inferred.
train_user['time'] = pd.to_datetime(train_user['time'], format='%Y-%m-%d %H')

Unnamed: 0,user_id,item_id,behavior_type,user_geohash,item_category,time
0,98047837,232431562,1,,4245,2014-12-06 02:00:00
1,97726136,383583590,1,,5894,2014-12-09 20:00:00
2,98607707,64749712,1,,2883,2014-12-18 11:00:00
3,98662432,320593836,1,96nn52n,6562,2014-12-06 10:00:00
4,98145908,290208520,1,,13926,2014-12-16 21:00:00


### 数据探索

In [66]:
# Human-readable label for each behavior_type code.
behavior_type_dict = {
    1: 'browse',
    2: 'collection',
    3: 'cart',
    4: 'buy',
}
# Translate every code to its label; dict.get yields None for an unknown code.
behavior_type_transform = train_user['behavior_type'].map(behavior_type_dict.get)

In [78]:
# Summarise the mapped labels: count, distinct labels, most frequent label.
behavior_type_transform.describe()

count     12256906
unique           4
top         browse
freq      11550581
Name: behavior_type, dtype: object

In [79]:
# Attach the label Series to the table as a new column.
train_user['behavior_type_transform'] = behavior_type_transform

In [80]:
# Preview the first 10 rows to check the new behavior_type_transform column.
train_user.head(10)

Unnamed: 0,user_id,item_id,behavior_type,user_geohash,item_category,time,behavior_type_transform
0,98047837,232431562,1,,4245,2014-12-06 02:00:00,browse
1,97726136,383583590,1,,5894,2014-12-09 20:00:00,browse
2,98607707,64749712,1,,2883,2014-12-18 11:00:00,browse
3,98662432,320593836,1,96nn52n,6562,2014-12-06 10:00:00,browse
4,98145908,290208520,1,,13926,2014-12-16 21:00:00,browse
5,93784494,337869048,1,,3979,2014-12-03 20:00:00,browse
6,94832743,105749725,1,,9559,2014-12-13 20:00:00,browse
7,95290487,76866650,1,,10875,2014-11-27 16:00:00,browse
8,96610296,161166643,1,,3064,2014-12-11 23:00:00,browse
9,100684618,21751142,3,,2158,2014-12-05 23:00:00,cart


In [98]:
# One 0/1 indicator column per behaviour type, so that summing a column within
# a group gives the number of events of that type.
# A vectorized comparison is used rather than a per-row Python lambda, which is
# slow on a table of this size; the resulting int64 values are the same.
train_user['browse'] = (train_user['behavior_type'] == 1).astype('int64')
train_user['collection'] = (train_user['behavior_type'] == 2).astype('int64')
train_user['cart'] = (train_user['behavior_type'] == 3).astype('int64')
train_user['buy'] = (train_user['behavior_type'] == 4).astype('int64')

In [106]:
# Count events of each behaviour type per (user, item) pair by summing the
# 0/1 indicator columns within each group.
# The columns are selected with a list: selecting several columns from a
# GroupBy with a bare tuple is rejected by pandas 2.x.
train_user_count = (
    train_user
    .groupby(['user_id', 'item_id'])[['browse', 'collection', 'cart', 'buy']]
    .sum()
    .reset_index()
)

In [107]:
# Summary statistics of the per-(user, item) behaviour counts.
train_user_count.describe()

Unnamed: 0,user_id,item_id,browse,collection,cart,buy
count,4686904.0,4686904.0,4686904.0,4686904.0,4686904.0,4686904.0
mean,71564010.0,202369500.0,2.464437,0.05175186,0.07330297,0.02564699
std,41134160.0,116761200.0,2.327758,0.235627,0.3177246,0.198431
min,4913.0,64.0,0.0,0.0,0.0,0.0
25%,35758740.0,101475200.0,1.0,0.0,0.0,0.0
50%,72789010.0,202247200.0,2.0,0.0,0.0,0.0
75%,107223700.0,303637200.0,3.0,0.0,0.0,0.0
max,142455900.0,404562500.0,125.0,9.0,14.0,31.0


In [118]:
# Preview the first rows of the per-(user, item) behaviour counts.
train_user_count.head()

Unnamed: 0,user_id,item_id,browse,collection,cart,buy
0,4913,315532,1,0,0,0
1,4913,876969,6,0,0,0
2,4913,2741340,1,0,0,0
3,4913,3007091,3,0,0,0
4,4913,4210607,1,0,0,0


In [111]:
# (rows, columns) of the aggregated table; one row per (user, item) pair.
train_user_count.shape

(4686904, 6)

In [113]:
# Count events of each behaviour type per item by summing the 0/1 indicator
# columns within each item_id group.
# The columns are selected with a list: selecting several columns from a
# GroupBy with a bare tuple is rejected by pandas 2.x.
train_user_count_item = (
    train_user
    .groupby(['item_id'])[['browse', 'collection', 'cart', 'buy']]
    .sum()
    .reset_index()
)

In [114]:
# Summary statistics of the per-item behaviour counts.
train_user_count_item.describe()

Unnamed: 0,item_id,browse,collection,cart,buy
count,2876947.0,2876947.0,2876947.0,2876947.0,2876947.0
mean,202286400.0,4.014874,0.08431021,0.1194196,0.04178214
std,116798300.0,8.44366,0.3448123,0.4773012,0.2908267
min,64.0,0.0,0.0,0.0,0.0
25%,101111300.0,1.0,0.0,0.0,0.0
50%,202203400.0,2.0,0.0,0.0,0.0
75%,303394800.0,4.0,0.0,0.0,0.0
max,404562500.0,1431.0,24.0,56.0,50.0


In [119]:
# Preview the first rows of the per-item behaviour counts.
train_user_count_item.head()

Unnamed: 0,item_id,browse,collection,cart,buy
0,64,2,0,0,0
1,496,2,0,0,0
2,528,2,0,0,0
3,579,2,0,0,0
4,581,4,0,0,0


In [120]:
# Count events of each behaviour type per user by summing the 0/1 indicator
# columns within each user_id group.
# The columns are selected with a list: selecting several columns from a
# GroupBy with a bare tuple is rejected by pandas 2.x.
train_user_count_user = (
    train_user
    .groupby(['user_id'])[['browse', 'collection', 'cart', 'buy']]
    .sum()
    .reset_index()
)

In [122]:
# Summary statistics of the per-user behaviour counts.
train_user_count_user.describe()

Unnamed: 0,user_id,browse,collection,cart,buy
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,71448970.0,1155.0581,24.2556,34.3564,12.0205
std,41199490.0,1430.052774,73.900635,63.889429,19.050621
min,4913.0,1.0,0.0,0.0,0.0
25%,35471930.0,297.0,0.0,2.0,2.0
50%,72514220.0,703.0,2.0,12.0,7.0
75%,107305200.0,1461.0,18.0,39.0,15.0
max,142455900.0,27720.0,2935.0,1810.0,809.0


In [123]:
# Preview the first rows of the per-user behaviour counts.
train_user_count_user.head()

Unnamed: 0,user_id,browse,collection,cart,buy
0,4913,1658,49,29,6
1,6118,112,4,0,1
2,7528,183,1,24,6
3,7591,824,0,14,21
4,12645,248,2,10,8
