In [4]:
import matplotlib.pyplot as plt
import pandas as pd

# 对pandas和matplotlib的显示设置
pd.set_option("display.max_columns", 30)
plt.rcParams.update({"font.family": "SimHei", "font.size": 14})
plt.style.use("tableau-colorblind10")
%matplotlib inline

In [5]:
# data_user_log = pd.read_csv("/home/mw/input/tmall_repurch6487/data/user_log_format1.csv")  # 初次导入数据时启用
data_user_info = pd.read_csv("../data/format1/data_format1/user_info_format1.csv")
data_train = pd.read_csv("../data/format1/data_format1/train_format1.csv")
data_test = pd.read_csv("../data/format1/data_format1/test_format1.csv")
d_types = {
    "user_id": "int32",
    "item_id": "int32",
    "cat_id": "int16",
    "seller_id": "int16",
    "brand_id": "float32",
    "time_stamp": "int16",
    "action_type": "int8",
}
data_user_log = pd.read_csv("../data/format1/data_format1/user_log_format1.csv", dtype=d_types)
# check tables
display(data_user_log.head(1))
display(data_user_info.head(1))
display(data_train.head(1))
display(data_test.head(1))

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0


Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0


Unnamed: 0,user_id,merchant_id,label
0,34176,3906,0


Unnamed: 0,user_id,merchant_id,prob
0,163968,4605,


In [6]:
data_user_log.rename(columns={"seller_id": "merchant_id"}, inplace=True)
data_train["origin"] = "train"
data_test["origin"] = "test"
data = pd.concat([data_train, data_test], sort=False)
data = data.drop(["prob"], axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 522341 entries, 0 to 261476
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      522341 non-null  int64  
 1   merchant_id  522341 non-null  int64  
 2   label        260864 non-null  float64
 3   origin       522341 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 19.9+ MB


In [7]:
data_user_info["age_range"].fillna(0, inplace=True)  # 0和null代表未知
data_user_info["gender"].fillna(0, inplace=True)  # 2和null代表未知

data_user_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    424170 non-null  int64  
 1   age_range  424170 non-null  float64
 2   gender     424170 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 9.7 MB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_user_info["age_range"].fillna(0, inplace=True)  # 0和null代表未知
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_user_info["gender"].fillna(0, inplace=True)  # 2和null代表未知


In [8]:
data_user_log.dtypes, data.dtypes

(user_id          int32
 item_id          int32
 cat_id           int16
 merchant_id      int16
 brand_id       float32
 time_stamp       int16
 action_type       int8
 dtype: object,
 user_id          int64
 merchant_id      int64
 label          float64
 origin          object
 dtype: object)

In [9]:
# 按user_id分组
groups = data_user_log.groupby(["user_id"])
# 统计交互总次数
temp = groups.size().reset_index().rename(columns={0: "u1"})
data = pd.merge(data, temp, on="user_id", how="left")
data.head(3)

Unnamed: 0,user_id,merchant_id,label,origin,u1
0,34176,3906,0.0,train,451
1,34176,121,0.0,train,451
2,34176,4356,1.0,train,451


In [10]:
# 统计交互天数
temp = groups.time_stamp.nunique().reset_index().rename(columns={"time_stamp": "u2"})
data = data.merge(temp, on="user_id", how="left")
data.head(3)

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2
0,34176,3906,0.0,train,451,47
1,34176,121,0.0,train,451,47
2,34176,4356,1.0,train,451,47


In [11]:
# 统计交互过的商品、品类、品牌、商家数
temp = (
    groups[["item_id", "cat_id", "merchant_id", "brand_id"]]
    .nunique()
    .reset_index()
    .rename(columns={"item_id": "u3", "cat_id": "u4", "merchant_id": "u5", "brand_id": "u6"})
)
data = data.merge(temp, on="user_id", how="left")
data.head(3)

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u6
0,34176,3906,0.0,train,451,47,256,45,109,108
1,34176,121,0.0,train,451,47,256,45,109,108
2,34176,4356,1.0,train,451,47,256,45,109,108


In [12]:
# 统计点击、加购物车、购买、收藏的操作次数
temp = (
    groups["action_type"].value_counts().unstack().reset_index().rename(columns={0: "u7", 1: "u8", 2: "u9", 3: "u10"})
)
data = data.merge(temp, on="user_id", how="left")
data.head(3)

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10
0,34176,3906,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0
1,34176,121,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0
2,34176,4356,1.0,train,451,47,256,45,109,108,410.0,,34.0,7.0


In [13]:
# 统计购买点击比
data["u11"] = data["u9"] / data["u7"]
# 复购率 = 复购过的商家数/购买过的总商家数
# 按user_id,merchant_id分组，购买天数>1则复购标记为1，反之为0
groups_rb = data_user_log[data_user_log["action_type"] == 2].groupby(["user_id", "merchant_id"])
temp_rb = groups_rb.time_stamp.nunique().reset_index().rename(columns={"time_stamp": "n_days"})
temp_rb["label_um"] = [(1 if x > 1 else 0) for x in temp_rb["n_days"]]

# 与data进行匹配
temp = temp_rb.groupby(["user_id", "label_um"]).size().unstack(fill_value=0).reset_index()
temp["u12"] = temp[1] / (temp[0] + temp[1])

data = data.merge(temp[["user_id", "u12"]], on="user_id", how="left")
data.head(3)

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12
0,34176,3906,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,0.045455
1,34176,121,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,0.045455
2,34176,4356,1.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,0.045455


In [14]:
data_user_log.head(10)

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0
5,328862,623866,1271,2882,2661.0,829,0
6,328862,542871,1467,2882,2661.0,829,0
7,328862,536347,1095,883,1647.0,829,0
8,328862,364513,1271,2882,2661.0,829,0
9,328862,575153,1271,2882,2661.0,829,0


In [15]:
# 性别、年龄独热编码处理
data = data.merge(data_user_info, on="user_id", how="left")

temp = pd.get_dummies(data["age_range"], prefix="age")
temp2 = pd.get_dummies(data["gender"], prefix="gender")

data = pd.concat([data, temp, temp2], axis=1)
data.drop(columns=["age_range", "gender"], inplace=True)
data.head(3)

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,age_0.0,age_1.0,age_2.0,age_3.0,age_4.0,age_5.0,age_6.0,age_7.0,age_8.0,gender_0.0,gender_1.0,gender_2.0
0,34176,3906,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,0.045455,False,False,False,False,False,False,True,False,False,True,False,False
1,34176,121,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,0.045455,False,False,False,False,False,False,True,False,False,True,False,False
2,34176,4356,1.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,0.045455,False,False,False,False,False,False,True,False,False,True,False,False


In [16]:
# 按merchant_id分组
groups = data_user_log.groupby(["merchant_id"])
# 统计交互总次数
temp = groups.size().reset_index().rename(columns={0: "m1"})
data = pd.merge(data, temp, on="merchant_id", how="left")
data.head(3)

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,age_0.0,age_1.0,age_2.0,age_3.0,age_4.0,age_5.0,age_6.0,age_7.0,age_8.0,gender_0.0,gender_1.0,gender_2.0,m1
0,34176,3906,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,0.045455,False,False,False,False,False,False,True,False,False,True,False,False,16269
1,34176,121,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,0.045455,False,False,False,False,False,False,True,False,False,True,False,False,79865
2,34176,4356,1.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,0.045455,False,False,False,False,False,False,True,False,False,True,False,False,7269


In [17]:
# 统计交互天数
temp = groups.time_stamp.nunique().reset_index().rename(columns={"time_stamp": "m2"})
data = data.merge(temp, on="merchant_id", how="left")
data.head(3)


Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,age_0.0,age_1.0,age_2.0,age_3.0,age_4.0,age_5.0,age_6.0,age_7.0,age_8.0,gender_0.0,gender_1.0,gender_2.0,m1,m2
0,34176,3906,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,0.045455,False,False,False,False,False,False,True,False,False,True,False,False,16269,185
1,34176,121,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,0.045455,False,False,False,False,False,False,True,False,False,True,False,False,79865,185
2,34176,4356,1.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,0.045455,False,False,False,False,False,False,True,False,False,True,False,False,7269,155


In [18]:
# 统计交互过的商品、品类、品牌、用户数
temp = (
    groups[["item_id", "cat_id", "user_id", "brand_id"]]
    .nunique()
    .reset_index()
    .rename(columns={"item_id": "m3", "cat_id": "m4", "user_id": "m5", "brand_id": "m6"})
)
data = data.merge(temp, on="merchant_id", how="left")
data.head(3)

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,...,age_3.0,age_4.0,age_5.0,age_6.0,age_7.0,age_8.0,gender_0.0,gender_1.0,gender_2.0,m1,m2,m3,m4,m5,m6
0,34176,3906,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,False,False,False,True,False,False,True,False,False,16269,185,308,20,5819,1
1,34176,121,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,False,False,False,True,False,False,True,False,False,79865,185,1179,26,10931,1
2,34176,4356,1.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,False,False,False,True,False,False,True,False,False,7269,155,67,15,2281,1


In [19]:
# 统计点击、加购物车、购买、收藏的操作次数
temp = (
    groups["action_type"].value_counts().unstack().reset_index().rename(columns={0: "m7", 1: "m8", 2: "m9", 3: "m10"})
)
data = data.merge(temp, on="merchant_id", how="left")
data.head(3)

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,...,age_7.0,age_8.0,gender_0.0,gender_1.0,gender_2.0,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10
0,34176,3906,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,False,False,True,False,False,16269,185,308,20,5819,1,14870.0,28.0,410.0,961.0
1,34176,121,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,False,False,True,False,False,79865,185,1179,26,10931,1,72265.0,121.0,4780.0,2699.0
2,34176,4356,1.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,False,False,True,False,False,7269,155,67,15,2281,1,6094.0,16.0,963.0,196.0


In [20]:
# 统计购买点击比
data["m11"] = data["m9"] / data["m7"]
# 复购率 = 复购过的用户数/购买过的总用户数
# 按user_id,merchant_id分组，购买天数>1则复购标记为1，反之为0（在上一步已计算）
# 与data进行匹配
temp = temp_rb.groupby(["merchant_id", "label_um"]).size().unstack(fill_value=0).reset_index()
temp["m12"] = temp[1] / (temp[0] + temp[1])

data = data.merge(temp[["merchant_id", "m12"]], on="merchant_id", how="left")
data.head(3)

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,...,gender_0.0,gender_1.0,gender_2.0,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12
0,34176,3906,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,True,False,False,16269,185,308,20,5819,1,14870.0,28.0,410.0,961.0,0.027572,0.048387
1,34176,121,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,True,False,False,79865,185,1179,26,10931,1,72265.0,121.0,4780.0,2699.0,0.066145,0.053014
2,34176,4356,1.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,True,False,False,7269,155,67,15,2281,1,6094.0,16.0,963.0,196.0,0.158024,0.084444


In [21]:
# 按user_id,merchant_id分组
groups = data_user_log.groupby(["user_id", "merchant_id"])
# 统计交互总次数
temp = groups.size().reset_index().rename(columns={0: "um1"})
data = pd.merge(data, temp, on=["merchant_id", "user_id"], how="left")
data.head(3)

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,...,gender_1.0,gender_2.0,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,um1
0,34176,3906,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,False,False,16269,185,308,20,5819,1,14870.0,28.0,410.0,961.0,0.027572,0.048387,39
1,34176,121,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,False,False,79865,185,1179,26,10931,1,72265.0,121.0,4780.0,2699.0,0.066145,0.053014,14
2,34176,4356,1.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,False,False,7269,155,67,15,2281,1,6094.0,16.0,963.0,196.0,0.158024,0.084444,18


In [22]:
# 统计交互天数
temp = groups.time_stamp.nunique().reset_index().rename(columns={"time_stamp": "um2"})
data = data.merge(temp, on=["merchant_id", "user_id"], how="left")
data.head(3)

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,...,gender_2.0,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,um1,um2
0,34176,3906,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,False,16269,185,308,20,5819,1,14870.0,28.0,410.0,961.0,0.027572,0.048387,39,9
1,34176,121,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,False,79865,185,1179,26,10931,1,72265.0,121.0,4780.0,2699.0,0.066145,0.053014,14,3
2,34176,4356,1.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,False,7269,155,67,15,2281,1,6094.0,16.0,963.0,196.0,0.158024,0.084444,18,2


In [23]:
# 统计交互过的商品、品类、品牌数
temp = (
    groups[["item_id", "cat_id", "brand_id"]]
    .nunique()
    .reset_index()
    .rename(columns={"item_id": "um3", "cat_id": "um4", "brand_id": "um5"})
)
data = data.merge(temp, on=["merchant_id", "user_id"], how="left")
data.head(3)

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,...,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,um1,um2,um3,um4,um5
0,34176,3906,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,308,20,5819,1,14870.0,28.0,410.0,961.0,0.027572,0.048387,39,9,20,6,1
1,34176,121,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,1179,26,10931,1,72265.0,121.0,4780.0,2699.0,0.066145,0.053014,14,3,1,1,1
2,34176,4356,1.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,67,15,2281,1,6094.0,16.0,963.0,196.0,0.158024,0.084444,18,2,2,1,1


In [24]:
# 统计点击、加购物车、购买、收藏的操作次数
temp = (
    groups["action_type"]
    .value_counts()
    .unstack()
    .reset_index()
    .rename(columns={0: "um6", 1: "um7", 2: "um8", 3: "um9"})
)
data = data.merge(temp, on=["merchant_id", "user_id"], how="left")
data.head(3)

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,...,m7,m8,m9,m10,m11,m12,um1,um2,um3,um4,um5,um6,um7,um8,um9
0,34176,3906,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,14870.0,28.0,410.0,961.0,0.027572,0.048387,39,9,20,6,1,36.0,,1.0,2.0
1,34176,121,0.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,72265.0,121.0,4780.0,2699.0,0.066145,0.053014,14,3,1,1,1,13.0,,1.0,
2,34176,4356,1.0,train,451,47,256,45,109,108,410.0,,34.0,7.0,0.082927,...,6094.0,16.0,963.0,196.0,0.158024,0.084444,18,2,2,1,1,12.0,,6.0,


In [25]:
data["um10"] = data["um8"] / data["um6"]  # 购买/点击
# data["um11"] = data["um8"] / data["um7"]  # 购买/加购


In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522341 entries, 0 to 522340
Data columns (total 50 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      522341 non-null  int64  
 1   merchant_id  522341 non-null  int64  
 2   label        260864 non-null  float64
 3   origin       522341 non-null  object 
 4   u1           522341 non-null  int64  
 5   u2           522341 non-null  int64  
 6   u3           522341 non-null  int64  
 7   u4           522341 non-null  int64  
 8   u5           522341 non-null  int64  
 9   u6           522341 non-null  int64  
 10  u7           521981 non-null  float64
 11  u8           38179 non-null   float64
 12  u9           522341 non-null  float64
 13  u10          294859 non-null  float64
 14  u11          521981 non-null  float64
 15  u12          522341 non-null  float64
 16  age_0.0      522341 non-null  bool   
 17  age_1.0      522341 non-null  bool   
 18  age_2.0      522341 non-

In [27]:
fcols = data.select_dtypes("float64").columns
icols = data.select_dtypes("int64").columns
data[fcols] = data[fcols].apply(pd.to_numeric, downcast="float")
data[icols] = data[icols].apply(pd.to_numeric, downcast="integer")

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522341 entries, 0 to 522340
Data columns (total 50 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      522341 non-null  int32  
 1   merchant_id  522341 non-null  int16  
 2   label        260864 non-null  float32
 3   origin       522341 non-null  object 
 4   u1           522341 non-null  int16  
 5   u2           522341 non-null  int16  
 6   u3           522341 non-null  int16  
 7   u4           522341 non-null  int16  
 8   u5           522341 non-null  int16  
 9   u6           522341 non-null  int16  
 10  u7           521981 non-null  float32
 11  u8           38179 non-null   float32
 12  u9           522341 non-null  float32
 13  u10          294859 non-null  float32
 14  u11          521981 non-null  float32
 15  u12          522341 non-null  float32
 16  age_0.0      522341 non-null  bool   
 17  age_1.0      522341 non-null  bool   
 18  age_2.0      522341 non-

In [28]:
data.fillna(0, inplace=True)
# 拆分train、test数据集
train = data[data["origin"] == "train"].drop(["origin"], axis=1)
test = data[data["origin"] == "test"].drop(["origin", "label"], axis=1)
X, Y = train.drop(["label"], axis=1), train["label"]

# 拆分训练集与验证集
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split

train_x, valid_x, train_y, valid_y = train_test_split(X, Y, test_size=0.2)

In [29]:
def make_predictions(cur_model, model_name):
    prob = cur_model.predict_proba(test)[:, 1]
    # 保存最优模型预测结果
    submission = pd.DataFrame()
    submission[["user_id", "merchant_id"]] = test[["user_id", "merchant_id"]]
    submission["prob"] = prob
    submission.to_csv(f"../data/output/{model_name}_submission.csv", index=False)

In [30]:
from sklearn.ensemble import RandomForestClassifier

# 使用默认参数建模
# v2: 0.6354
# v3: 0.6427330871585831
model = RandomForestClassifier()
model.fit(train_x, train_y)
auc_rf = roc_auc_score(valid_y, model.predict_proba(valid_x)[:, 1])
print("accuracy：", model.score(valid_x, valid_y))
print("roc_auc：", auc_rf)

accuracy： 0.9387230943208172
roc_auc： 0.6449265926411549


In [31]:
grid_search = GridSearchCV(
    cv=3,
    estimator=RandomForestClassifier(),
    param_grid={
        "max_depth": [100],
        "max_samples": [0.5],
        "min_samples_leaf": [100],
        "min_samples_split": [2, 10, 500],
        "n_estimators": [100, 200],
    },
    scoring="roc_auc",
)

grid_search.fit(train_x, train_y)

# 调参后的最优参数结果
display(grid_search.best_params_)
display(grid_search.best_score_)

{'max_depth': 100,
 'max_samples': 0.5,
 'min_samples_leaf': 100,
 'min_samples_split': 2,
 'n_estimators': 200}

np.float64(0.6767499484171674)

In [32]:
{"max_depth": 100, "max_samples": 0.5, "min_samples_leaf": 100, "min_samples_split": 10, "n_estimators": 200}
# 0.6749089146228416
# v2: 0.6720945176325194
# v3: 0.673459432854568

# 二次计算时，直接使用最优参数建模
model = RandomForestClassifier(
    max_depth=100, max_samples=0.5, min_samples_leaf=100, min_samples_split=10, n_estimators=100
)
model.fit(train_x, train_y)


auc_rf = roc_auc_score(valid_y, model.predict_proba(valid_x)[:, 1])
print("accuracy：", model.score(valid_x, valid_y))
print("roc_auc：", auc_rf)

accuracy： 0.9388955973396201
roc_auc： 0.6736516594266367


In [33]:
from lightgbm import LGBMClassifier

# 使用默认参数建模
# v1: roc_auc： 0.6913892506145419
# v2: roc_auc： 0.6791848668767119
# v3: 0.6779894022863103
model = LGBMClassifier()
model.fit(train_x, train_y)
auc_lgbm = roc_auc_score(valid_y, model.predict_proba(valid_x)[:, 1])
print("accuracy：", model.score(valid_x, valid_y))
print("roc_auc：", auc_lgbm)

[LightGBM] [Info] Number of positive: 12764, number of negative: 195927
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010807 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6343
[LightGBM] [Info] Number of data points in the train set: 208691, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.061162 -> initscore=-2.731113
[LightGBM] [Info] Start training from score -2.731113
accuracy： 0.9388572633354417
roc_auc： 0.6781669458386679


In [34]:
{
    "boosting_type": "dart",
    "learning_rate": 0.05,
    "max_depth": 10,
    "min_split_gain": 0.05,
    "n_estimators": 1000,
    "num_leaves": 30,
    "subsample": 0.5,
}
# 0.6820475138840937
# v2: roc_auc： 0.6818937751380304
# v3: 0.6813793261958209
# evaluate the model
# model = grid_search.best_estimator_

model = LGBMClassifier(
    boosting_type="dart",
    learning_rate=0.05,
    max_depth=10,
    min_split_gain=0.05,
    n_estimators=1000,
    num_leaves=30,
    subsample=0.5,
)
model.fit(train_x, train_y)


auc_lgbm = roc_auc_score(valid_y, model.predict_proba(valid_x)[:, 1])
print("accuracy：", model.score(valid_x, valid_y))
print("roc_auc：", auc_lgbm)

model.fit(X, Y)
make_predictions(model, "lgbm_v3")

[LightGBM] [Info] Number of positive: 12764, number of negative: 195927
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010190 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6343
[LightGBM] [Info] Number of data points in the train set: 208691, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.061162 -> initscore=-2.731113
[LightGBM] [Info] Start training from score -2.731113
accuracy： 0.9388955973396201
roc_auc： 0.6833694961290099
[LightGBM] [Info] Number of positive: 15952, number of negative: 244912
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014185 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6336
[LightGBM] [Info] Number of data points in the train set: 260864, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.061151 -> initscore=-2.731315
[Ligh

In [35]:
def make_predictions(cur_model, model_name):
    prob = cur_model.predict_proba(test)[:, 1]
    # 保存最优模型预测结果
    submission = pd.DataFrame()
    submission[["user_id", "merchant_id"]] = test[["user_id", "merchant_id"]]
    submission["prob"] = prob
    submission.to_csv(f"../data/output/{model_name}_submission.csv", index=False)

In [36]:
make_predictions(model, "lgbm_v3")

In [37]:
from xgboost import XGBClassifier

# v1: roc_auc： 0.6920730595093996
# v2:  0.6792565418549859
model = XGBClassifier(
    eta=0.1, gamma=5, max_depth=50, min_child_weight=100, objective="binary:logistic", eval_metric="auc", subsample=0.5
)
model.fit(train_x, train_y)


auc_xgb = roc_auc_score(valid_y, model.predict_proba(valid_x)[:, 1])
print("accuracy：", model.score(valid_x, valid_y))
print("roc_auc：", auc_xgb)

model.fit(X, Y)

make_predictions(model, "xgb_v3")

accuracy： 0.9388955973396201
roc_auc： 0.6796795174155814


In [38]:
from sklearn.ensemble import RandomForestClassifier

# v1: roc_auc： 0.685645592378436
# v2: roc_auc： 0.6720845192538988

model = RandomForestClassifier(max_depth=100, min_samples_leaf=50, min_samples_split=10)
model.fit(train_x, train_y)


auc_rf = roc_auc_score(valid_y, model.predict_proba(valid_x)[:, 1])
print("accuracy：", model.score(valid_x, valid_y))
print("roc_auc：", auc_rf)

accuracy： 0.9388955973396201
roc_auc： 0.6779544643336264


In [39]:
from catboost import CatBoostClassifier, Pool

cat_features = [i for i, c in enumerate(train_x.columns) if str(train_x[c].dtype) in ("object", "category", "bool")]
# v1: 0.6925921178402836
print("Categorical feature count:", len(cat_features))
train_pool = Pool(train_x, label=train_y, cat_features=cat_features)
valid_pool = Pool(valid_x, label=valid_y, cat_features=cat_features)
# 0.6926736295
params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "learning_rate": 0.05,
    "depth": 8,
    "l2_leaf_reg": 5,
    "iterations": 3000,
    "random_seed": 42,
    "od_type": "Iter",
    "od_wait": 200,  # 早停
    # "bootstrap_type": "Poisson",
    "task_type": "CPU",  # 有 GPU 可改为 "GPU"
    "verbose": 200,
}

# #
# params = {
#     "loss_function": "Logloss",
#     "eval_metric": "AUC",
#     "learning_rate": 0.07,
#     "depth": 7,
#     "l2_leaf_reg": 5,
#     "iterations": 3000,
#     "random_seed": 42,
#     "od_type": "Iter",
#     "od_wait": 200,          # 早停
#     # "bootstrap_type": "Poisson",
#     "task_type": "CPU",      # 有 GPU 可改为 "GPU"
#     "verbose": 200,
# }

model_cb = CatBoostClassifier(**params)
model_cb.fit(train_pool, eval_set=valid_pool, use_best_model=True)

pred_valid_proba = model_cb.predict_proba(valid_x)[:, 1]
auc_cb = roc_auc_score(valid_y, pred_valid_proba)
print("CatBoost AUC:", auc_cb)

model.fit(X, Y)
make_predictions(model_cb, "catboost_v3")

Categorical feature count: 12
0:	test: 0.6050367	best: 0.6050367 (0)	total: 195ms	remaining: 9m 46s
200:	test: 0.6795919	best: 0.6795919 (200)	total: 4.65s	remaining: 1m 4s
400:	test: 0.6832238	best: 0.6833218 (372)	total: 9.02s	remaining: 58.5s
600:	test: 0.6840276	best: 0.6841264 (585)	total: 13.5s	remaining: 53.9s
800:	test: 0.6832333	best: 0.6841291 (622)	total: 17.9s	remaining: 49.2s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.6841290685
bestIteration = 622

Shrink model to first 623 iterations.
CatBoost AUC: 0.684129068522628


In [40]:
xgb_result = pd.read_csv("../data/output/xgb_v3_submission.csv")
lgbm_result = pd.read_csv("../data/output/lgbm_v3_submission.csv")
cat_result = pd.read_csv("../data/output/catboost_v3_submission.csv")

final_preds = (xgb_result["prob"] * .3 + lgbm_result["prob"] * .4 + cat_result["prob"] * .3) / 3

finally_submission = pd.DataFrame()
finally_submission[["user_id", "merchant_id"]] = xgb_result[["user_id", "merchant_id"]]
finally_submission["prob"] = final_preds
finally_submission.to_csv("../data/output/finally_submission_v3.csv", index=False)